Assess the presence and abundance of Gardnerella spp. and their impacts on the vaginal microbiomes of pregnant women from three distinct cohorts.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.5 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.0.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(broom)
library(formattable)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
library(ggbeeswarm)
library(ggExtra)
library(coin)
## Loading required package: survival
library(cowplot)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following objects are masked from 'package:coin':
##
## chisq_test, friedman_test, kruskal_test, sign_test, wilcox_test
## The following object is masked from 'package:stats':
##
## filter
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
`%!in%` <- negate(`%in%`)
set.seed(7334)
figureOut <- "../FIGURES/submission_manuscript_figures"
tmpFigureOut <- "../FIGURES/submission_manuscript_figures/tmp" # folder for storing temp files for figure formatting
suDF <- "./sumgsDF.tsv" %>% # metadata for Stanford and UAB
read_tsv %>%
mutate(SampleID=as.character(SampleID))
hmp2DF <- "./hmp2mgsDF.tsv" %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID),
Cohort="MOMS-PI")
# sample data
sgDF0 <- suDF %>%
select("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs") %>%
mutate(Cohort=paste(Cohort, "Enriched")) %>%
full_join(hmp2DF[,c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")], by = c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")) %>%
mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI")), # Stanford and UAB samples were selected for enrichment in Gardnerella, label them has Enriched.
SampleID=as.character(SampleID),
SubjectID=as.character(SubjectID),
pctHuman=((Sickle_pairs-bbmapFiltered_pairs)/Sickle_pairs)*100, # calculate percent human reads
bacLoad=bbmapFiltered_pairs/(Sickle_pairs-bbmapFiltered_pairs)) # calculate microbial load
Taxa abundance tables Stanford and UAB cohorts:
suMetaphlanPath <- "../humann2/merged_tables/mergedMetaphlanAbundance.tsv"
suGardCladePath <- "./gardCladeRelativeAbundance.tsv"
suGardGenomoPath <- "./gardGenomospeciesRelativeAbundance.tsv"
MOMS-PI (HMP2) cohort
hmp2MetaphlanPath <- "../humann2/merged_tables/hmp2mgs_mergedMetaphlanAbundance.tsv"
hmp2GardCladePath <- "./hmp2gardCladeRelativeAbundance.tsv"
hmp2GardGenomoPath <- "./hmp2gardGenomospeciesRelativeAbundance.tsv"
Themes:
theme_set(theme_bw()+
theme(panel.grid = element_blank(),
axis.text = element_text(size=10),
axis.title = element_text(size=15),
legend.text = element_text(size=12),
legend.title = element_text(size=14)))
Clade colors:
cladeColors <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")))
cladeColorsUnch <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))))
cladeColorsTotal <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))))
cladeColorsTotalUnch <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)), bquote('Total'~italic(Gardnerella)))),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)), bquote('Total'~italic(Gardnerella)))))
genomoColorsTotal <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
labels=c(bquote(italic("G. vaginalis")),
"GS2",
"GS3",
bquote(italic("G. piotti")),
"GS11",
"GS7",
"GS8",
"GS9",
"GS10",
"GS14",
bquote(italic("G. leopoldii")),
bquote(italic("G. swidsinskii")),
"GS12",
"GS13",
bquote('Total'~italic(Gardnerella)))),
scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
labels=c(bquote(italic("G. vaginalis")),
"GS2",
"GS3",
bquote(italic("G. piotti")),
"GS11",
"GS7",
"GS8",
"GS9",
"GS10",
"GS14",
bquote(italic("G. leopoldii")),
bquote(italic("G. swidsinskii")),
"GS12",
"GS13",
bquote('Total'~italic(Gardnerella)))))
genomoColorsTotalUnch <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
labels=c(bquote(italic("G. vaginalis")),
"GS2",
"GS3",
bquote(italic("G. piotti")),
"GS11",
"GS7",
"GS8",
"GS9",
"GS10",
"GS14",
bquote(italic("G. leopoldii")),
bquote(italic("G. swidsinskii")),
"GS12",
"GS13",
bquote('Uncharacterized'~italic(Gardnerella)),
bquote('Total'~italic(Gardnerella)))),
scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
labels=c(bquote(italic("G. vaginalis")),
"GS2",
"GS3",
bquote(italic("G. piotti")),
"GS11",
"GS7",
"GS8",
"GS9",
"GS10",
"GS14",
bquote(italic("G. leopoldii")),
bquote(italic("G. swidsinskii")),
"GS12",
"GS13",
bquote('Uncharacterized'~italic(Gardnerella)),
bquote('Total'~italic(Gardnerella)))))
Gardnerella and Lactobacillus Colors:
gLColors <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred", "darkseagreen4"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")))
All Taxa Colors:
taxColors <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
"darkgreen", "khaki1", "darkred", "darkblue",
"mediumpurple4", "#C42104", "darkseagreen4", "#2004C2"),
labels=c("C1", "C2", "C3", "C4", "C5", "C6",
bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
)),
scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
"darkgreen", "khaki1", "darkred", "darkblue",
"mediumpurple4", "#C42104", "darkseagreen4", "#2004C2"),
labels=c("C1", "C2", "C3", "C4", "C5", "C6",
bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
)))
taxColorsOther <- list(scale_color_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
"darkgreen", "khaki1", "darkred", "darkblue",
"mediumpurple4", "#C42104", "darkseagreen4", "#2004C2"),
labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
)),
scale_fill_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
"darkgreen", "khaki1", "darkred", "darkblue",
"mediumpurple4", "#C42104", "darkseagreen4", "#2004C2"),
labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
)))
Bi/Tri/Quad Colors:
binaryColors <- list(scale_color_manual(values = c("#D55E00", "#56B4E9")),
scale_fill_manual(values=c("#D55E00", "#56B4E9")))
trinaryColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D")),
scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D"))))
quadColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D", "#2B2726")),
scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D", "#2B2726"))))
quadColors2 <- list(scale_color_manual(values = c("#D55E00", "#009E73", "#CC79A7", "#56B4E9")),
scale_fill_manual(values=c("#D55E00","#009E73", "#CC79A7", "#56B4E9")))
Create some shortcut lists for convenience.
## Other shortcuts
clades <- c("C1", "C2", "C3", "C4", "C5", "C6")
genomos <- c("GS1", "GS2", "GS3", "GS4", "GS5", "GS6", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomoNames <- c("G. vag.", "GS2", "GS3", "G. pio.", "G. leo.", "G. swi.", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomosCladeOrder <- c("GS1", "GS2", "GS3", "GS4", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "GS5", "GS6", "GS12", "GS13")
genomoNamesCladeOrder <- c("G. vag.", "GS2", "GS3", "G. pio.", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "G. leo.", "G. swi.", "GS12", "GS13")
lactos <- c("Lactobacillus_crispatus", "Lactobacillus_gasseri", "Lactobacillus_jensenii", "Lactobacillus_iners")
anaerobes <- c("Atopobium_vaginae", "Finegoldia_magna", "Mycoplasma_hominis", "Megasphaera_genomosp_type_1", "Megasphaera_unclassified", "Prevotella_amnii", "Prevotella_bivia", "Prevotella_timonensis", "Ureaplasma_parvum")
anaerobesSL <- c("Atopobium_vaginae", "Mycoplasma_hominis", "Prevotella_bivia", "Ureaplasma_parvum") # short list of anaerobes
abbrevs <- c("TG", clades, "Lc", "Lg", "Lj", "Li", "Av", "Fm", "Mh", "Meg1", "Megu", "Pa", "Pb", "Pt", "Up")
abbrevsSL <- c("TG", clades, "Lc", "Lg", "Lj", "Li", "Av", "Mh", "Pb", "Up") # short list of abbreviations
cohorts <- c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")
cohortAbbrevs <- c("Stanford Enr.", "UAB Enr.", "MOMS-PI Enr.", "MOMS-PI")
Starting number of samples and subjects:
sgDF0 %>%
group_by(Cohort) %>%
summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
formattable()
| Cohort | N Subjects | N Samples |
|---|---|---|
| Stanford Enriched | 20 | 62 |
| UAB Enriched | 15 | 45 |
| MOMS-PI | 234 | 930 |
Paired reads in total library size, after quality filtering by Sickle, and after removing human reads.
sgDF0 %>%
gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads"))) %>%
group_by(Step, Cohort) %>%
summarise(min=min(paired_reads), mean=mean(paired_reads), max=max(paired_reads)) %>%
mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
formattable()
| Step | Cohort | min | mean | max |
|---|---|---|---|---|
| Library Size | Stanford Enriched | 14,655,421 | 19,681,739 | 29,263,108 |
| Library Size | UAB Enriched | 13,081,280 | 19,239,013 | 35,557,864 |
| Library Size | MOMS-PI | 12,210 | 10,162,162 | 89,647,588 |
| Sickle Filtered | Stanford Enriched | 12,757,737 | 17,122,069 | 24,968,647 |
| Sickle Filtered | UAB Enriched | 11,695,954 | 16,964,926 | 30,690,425 |
| Sickle Filtered | MOMS-PI | 0 | 6,877,062 | 78,549,522 |
| Microbial Reads | Stanford Enriched | 45,287 | 2,108,541 | 17,576,674 |
| Microbial Reads | UAB Enriched | 81,037 | 3,811,026 | 15,652,482 |
| Microbial Reads | MOMS-PI | 0 | 669,755.7 | 30,053,898 |
Percent filtered at each step
sgDF0 %>%
mutate(`Sickle % Total`=Sickle_pairs/TotalPairs,
`BBmap % Sickle`=bbmapFiltered_pairs/Sickle_pairs,
`BBmap % Total`=bbmapFiltered_pairs/TotalPairs) %>%
group_by(Cohort) %>%
summarise_at(c("Sickle % Total", "BBmap % Sickle", "BBmap % Total"), ~median(.x, na.rm=TRUE)) %>%
formattable()
| Cohort | Sickle % Total | BBmap % Sickle | BBmap % Total |
|---|---|---|---|
| Stanford Enriched | 0.8685448 | 0.04800637 | 0.04134656 |
| UAB Enriched | 0.8793049 | 0.20023969 | 0.17588516 |
| MOMS-PI | 0.7215741 | 0.04755454 | 0.02472847 |
sickle_pct_total = percent of paired reads left after sickle filtering bbmap_pct_sicke = percent of sickle paired reads left after removing human reads bbmap_pct_total = percent of all paired reads left after sickle filtering and removing human reads
View number of paired reads after sickle and human filtering per sample and determine filtering threshold.
p <- sgDF0 %>%
ggplot(aes(x=Sickle_pairs, y=bbmapFiltered_pairs, fill=Cohort, color=Cohort)) +
geom_point(alpha=0.25) +
trinaryColors +
geom_vline(xintercept = 1e6, color="grey", linetype="dashed") + # 1 million paired reads after sickle
scale_x_log10(n.breaks=10) +
scale_y_log10(n.breaks=10) +
theme_bw() +
theme(legend.position = "bottom",
axis.text = element_text(size=10),
axis.title = element_text(size=15),
legend.text = element_text(size=12),
legend.title = element_text(size=14)) +
labs(x="QC Filtered Paired Reads", y="Microbial Paired Reads")
ggMarginal(p, groupFill = TRUE, type="density")
Remove the samples with fewer than 1 million paired reads after Sickle filtering and trimming.
QCfilterSamples <- sgDF0 %>%
filter(Sickle_pairs>1e6) %>% # list of samples that pass QC filtering criteria and will be used in analyses
.$SampleID
length(QCfilterSamples) # number of samples that will be used in analyses
## [1] 888
sgDF1 <- sgDF0 %>%
filter(SampleID %in% QCfilterSamples)
Number of samples remaining in each cohort:
sgDF1 %>%
group_by(Cohort) %>%
summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
formattable()
| Cohort | N Subjects | N Samples |
|---|---|---|
| Stanford Enriched | 20 | 62 |
| UAB Enriched | 15 | 45 |
| MOMS-PI | 231 | 781 |
Will proceed with a total of 888 samples, 62 from Stanford, 45 from UAB, and 781 from MOMS-PI
Library sizes and percent filtered at each step post QC sample removal:
readCountsPlot <- sgDF1 %>%
gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads")),
Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI"), labels=c("Sanford\nEnriched", "UAB\nEnriched", "MOMS-PI"))) %>%
ggplot(aes(x=Cohort, y=paired_reads, fill=Cohort, color=Cohort)) +
geom_quasirandom(dodge.width = 0.9, alpha=0.5, size=1, show.legend = FALSE) +
geom_boxplot(alpha=0, position=position_dodge(width=0.9), color="black", show.legend = FALSE) +
scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
facet_wrap(~Step) +
trinaryColors +
theme_bw() +
theme(axis.text.x = element_text(size=10),
axis.text.y=element_text(size=13),
strip.text = element_text(size=13),
axis.title = element_text(size=16),
aspect.ratio = 1.5) +
labs(x=NULL, y="Paired Reads")
percentHumanHists <- sgDF1 %>%
group_by(Cohort) %>%
ggplot(aes(pctHuman)) +
geom_histogram(aes(y=stat(width*density))) +
scale_y_continuous(limits = c(0,0.4), n.breaks=9) +
facet_wrap(~Cohort) +
theme(axis.text = element_text(size=13),
axis.title = element_text(size=16),
strip.text = element_text(size=13),
aspect.ratio = 1.5) +
labs(x="Percent Human Reads", y="Proportion of Samples")
MOMS-PI samples had greatest percent of human reads overall. They were not selected for enrichment in Gardnerella. Samples in UAB cohort had the smallest percent of human reads.
readsTable <- sgDF1 %>%
group_by(Cohort) %>%
summarise_at(vars(TotalPairs, Sickle_pairs, bbmapFiltered_pairs, pctHuman), list(mean=mean, sd=sd)) %>%
mutate_at(vars(pctHuman_mean, pctHuman_sd), ~round(.x, 2)) %>%
mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
transmute(Cohort=Cohort,
`Library Size`=paste0(TotalPairs_mean, " (", TotalPairs_sd, ")"),
`Sickle Filtered`=paste0(Sickle_pairs_mean, " (", Sickle_pairs_sd, ")"),
`Microbial Reads`=paste0(bbmapFiltered_pairs_mean, " (", bbmapFiltered_pairs_sd, ")"),
`Percent Human Reads`=paste0(pctHuman_mean, " (", pctHuman_sd, ")")) %>%
kbl() %>%
kable_classic(full_width=TRUE, html_font = "Arial")
#save table to tmp file for adding to rest of read count figure
readsTablePngPath <- file.path(tmpFigureOut, paste(Sys.Date(), "readsTableComponent.png", sep="_"))
readsTable %>%
save_kable(file = readsTablePngPath, zoom=4)
readsAB <- plot_grid(readCountsPlot, percentHumanHists, nrow = 1, align = "hv", axis="tb", labels = c("A", "B"), label_size = 15)
printReadsTable <- ggdraw() + draw_image(readsTablePngPath)
plot_grid(readsAB, printReadsTable, nrow=2, rel_heights = c(1, 0.25), align = "v", axis = "l", labels = c("", "C"), label_size = 15)
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS3_ReadCounts.png", sep="_")))
Description of cohorts and sample collection schemes
samplingSched <- sgDF1 %>%
filter(!is.na(GDcoll)) %>%
mutate(GWcoll=GDcoll/7,
GWcoll=floor(GWcoll),
Cohort=factor(Cohort, levels=c("Stanford Enriched", "UAB Enriched", "MOMS-PI"))) %>%
split(.$Cohort) %>%
map(., ~ggplot(.x) +
geom_point(aes(x=ceiling(GWcoll), y=SubjectID), size=1, alpha=0.5) +
geom_point(aes(x=(GWdel+0.5), y=SubjectID), shape=")") +
facet_grid(.~Cohort) +
theme(axis.text = element_text(size=13),
axis.title = element_text(size=16),
strip.text = element_text(size=13),
aspect.ratio = 1.5) +
labs(x=NULL, y=NULL))
stanfordSamplingSched <- samplingSched$`Stanford Enriched` +
labs(y="Subject ID")
uabSamplingSched <- samplingSched$`UAB Enriched` +
labs(x="Gestation Weeks")
momspiSamplingSched <- samplingSched$`MOMS-PI` +
theme(axis.text.y = element_text(size=3))
plot_grid(stanfordSamplingSched, uabSamplingSched, momspiSamplingSched, rel_widths = c(1, 1, 1), rel_heights = c(1, 1, 1), align = "hv", nrow = 1)
#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS2_samplingSchedule.png", sep="_")))
sgDF1 %>%
filter(!is.na(GDcoll)) %>%
mutate(GWcoll=GDcoll/7,
GWcoll=floor(GWcoll)) %>%
group_by(Cohort) %>%
summarize(across(GWcoll, list(median=median, `25th`=~quantile(.x, 0.25), `75th`=~quantile(.x, 0.75))))
## # A tibble: 3 × 4
## Cohort GWcoll_median GWcoll_25th GWcoll_75th
## <fct> <dbl> <dbl> <dbl>
## 1 Stanford Enriched 22 16 31
## 2 UAB Enriched 26 19 32
## 3 MOMS-PI 29 20 36
hmp2SubjectsWDel <- sgDF1 %>%
filter(Cohort=="MOMS-PI",
!is.na(TermPre)) %>%
.$SubjectID %>%
unique
head(hmp2SubjectsWDel)
## [1] "2442683" "2442687" "2442892" "2442905" "2442618" "2443013"
length(hmp2SubjectsWDel)
## [1] 133
#GAAD
gaadDF <- sgDF1 %>%
select(SubjectID, Cohort, TermPre, GWdel) %>%
unique() %>%
add_count(Cohort, TermPre) %>%
group_by(Cohort, TermPre, n) %>%
summarize(mean=round(mean(GWdel, na.rm=TRUE), 1),
sd=round(sd(GWdel, na.rm = TRUE), 1)) %>%
replace_na(list(mean="-",
sd="-")) %>%
ungroup() %>%
mutate(TermPre=case_when(TermPre=="T"~"Term",
TermPre=="P"~"Preterm",
is.na(TermPre)~"Unknown"),
cat=paste0(Cohort, "\n", TermPre, " (n=", n, ")"),
`Mean gestational age in weeks at delivery (±SD)`=paste0(mean, " (", sd, ")")) %>%
replace_na(list(`Mean gestational age in weeks at delivery (±SD)`="-")) %>%
select(cat, `Mean gestational age in weeks at delivery (±SD)`) %>%
column_to_rownames(var="cat") %>%
t %>%
as.data.frame %>%
rownames_to_column(var="x") %>%
select("x", "Stanford Enriched\nTerm (n=11)", "Stanford Enriched\nPreterm (n=9)", "UAB Enriched\nTerm (n=5)", "UAB Enriched\nPreterm (n=10)", "MOMS-PI\nTerm (n=90)", "MOMS-PI\nPreterm (n=43)", "MOMS-PI\nUnknown (n=98)")
# Demographics
demDF <- sgDF1 %>%
select(SubjectID, Cohort, TermPre, Age, Race, Ethnicity, Education, Income, deliveryMode) %>%
unique() %>%
mutate(Cohort=as.character(Cohort)) %>%
add_count(Cohort, TermPre, name="term_count") %>%
mutate(TermPre=case_when(TermPre=="T"~"Term",
TermPre=="P"~"Preterm",
is.na(TermPre)~"Unknown"),
cat=paste0(Cohort, "\n", TermPre, " (n=", term_count, ")")) %>%
select(-SubjectID, -TermPre, -Cohort, -term_count) %>%
map2_df(., data.frame(matrix(rep(.$cat, 7), ncol=7)), ~count(data.frame(x=.x, y=.y), x, y), .id="z") %>%
filter(z!="cat") %>%
group_by(z,y) %>%
mutate(n=paste0(n, " (", round(n/sum(n)*100, 0), "%)")) %>%
ungroup %>%
spread(y, n) %>%
mutate_all(~replace_na(.x, "0 (0%)")) %>%
mutate(x=paste(x,z, sep="_"), #add variable category for arranging purposes because "unknown" appears multiple times and therefore causes an error when making x a factor class.
x=factor(x, c("Below 18_Age", "18 to 28_Age", "29 to 38_Age", "Above 38_Age", "Unknown_Age", "Asian_Race", "Black_Race", "White_Race", "Other_Race", "Hispanic_Ethnicity", "Non-Hispanic_Ethnicity", "Unknown_Ethnicity", "Less than high school_Education", "High school diploma or GED_Education", "Some college_Education", "Bachelor or undergraduate degree_Education", "Post-undergraduate degree_Education", "Unknown_Education", "No income_Income", "Under $10,000_Income", "$10,000 - $30,000_Income", "$30,000 - $50,000_Income", "$50,000 - $60,000_Income", "$80,000 - $100,000_Income", "$150,000 - $200,000_Income", "$250,000 or more_Income", "Under $15,000_Income", "$15,000 - $19,999_Income", "$20,000 - $39,999_Income", "$40,000 - $59,999_Income", "$60,000 - $79,999_Income", "$80,000 or more_Income", "Unknown_Income", "Vaginal_deliveryMode", "Cesarean_deliveryMode", "Unknown_deliveryMode"))) %>%
arrange(x) %>% # order rows
mutate(x=str_extract(x, ".*(?=_)")) %>% # remove category label now that rows are arranged
mutate_at(c("Stanford Enriched\nTerm (n=11)", "Stanford Enriched\nPreterm (n=9)", "UAB Enriched\nTerm (n=5)", "UAB Enriched\nPreterm (n=10)"), # remove 0's from income rows where they don't belong due to different levels in Stanford/UAB vs. MOMS-PI for income
~case_when(x %in% c("Under $15,000", "$15,000 - $19,999", "$20,000 - $39,999", "$40,000 - $59,999", "$60,000 - $79,999", "$80,000 or more")~"-",
x %!in% c("Under $15,000", "$15,000 - $19,999", "$20,000 - $39,999", "$40,000 - $59,999", "$60,000 - $79,999", "$80,000 or more")~.x)) %>%
mutate_at(c("MOMS-PI\nTerm (n=90)", "MOMS-PI\nPreterm (n=43)", "MOMS-PI\nUnknown (n=98)"), # remove 0's from income rows where they don't belong due to different levels in Stanford/UAB vs. MOMS-PI for income
~case_when(x %in% c("No income", "Under $10,000", "$10,000 - $30,000", "$30,000 - $50,000", "$50,000 - $60,000", "$80,000 - $100,000", "$150,000 - $200,000", "$250,000 or more")~"-",
x %!in% c("No income", "Under $10,000", "$10,000 - $30,000", "$30,000 - $50,000", "$50,000 - $60,000", "$80,000 - $100,000", "$150,000 - $200,000", "$250,000 or more")~.x)) %>%
select(x, `Stanford Enriched\nTerm (n=11)`, `Stanford Enriched\nPreterm (n=9)`, `UAB Enriched\nTerm (n=5)`, `UAB Enriched\nPreterm (n=10)`, `MOMS-PI\nTerm (n=90)`, `MOMS-PI\nPreterm (n=43)`, `MOMS-PI\nUnknown (n=98)`) # order columns
demTable <- gaadDF %>%
rbind(demDF) %>%
dplyr::rename(" "=x) %>%
dplyr::rename_at(2:8, ~str_extract(.x, "(?<=\\n).*")) %>%
kbl(caption = "Table 2. Sampling Cohorts") %>%
kable_classic(full_width=TRUE, html_font = "Arial") %>%
add_header_above(c(" ", "Stanford Enriched" = 2, "UAB Enriched" = 2, "MOMS-PI" = 3)) %>%
pack_rows("Age", 2, 6) %>%
pack_rows("Race", 7, 10) %>%
pack_rows("Ethnicity", 11, 13) %>%
pack_rows("Education", 14, 19) %>%
pack_rows("Income", 20, 34) %>%
pack_rows("Delivery Mode", 35, 37)
demTable
|
Stanford Enriched
|
UAB Enriched
|
MOMS-PI
|
|||||
|---|---|---|---|---|---|---|---|
| Term (n=11) | Preterm (n=9) | Term (n=5) | Preterm (n=10) | Term (n=90) | Preterm (n=43) | Unknown (n=98) | |
| Mean gestational age in weeks at delivery (±SD) | 39.2 (1.7) | 33.9 (2.3) | 37.8 (1.3) | 32.7 (3.4) | 40 (0.7) | 34.1 (3.4) |
|
| Age | |||||||
| Below 18 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (2%) | 1 (2%) | 1 (1%) |
| 18 to 28 | 2 (18%) | 0 (0%) | 4 (80%) | 6 (60%) | 57 (63%) | 28 (65%) | 54 (55%) |
| 29 to 38 | 4 (36%) | 6 (67%) | 1 (20%) | 3 (30%) | 27 (30%) | 12 (28%) | 39 (40%) |
| Above 38 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 4 (4%) | 2 (5%) | 2 (2%) |
| Unknown | 5 (45%) | 3 (33%) | 0 (0%) | 1 (10%) | 0 (0%) | 0 (0%) | 2 (2%) |
| Race | |||||||
| Asian | 2 (18%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (2%) |
| Black | 0 (0%) | 0 (0%) | 3 (60%) | 8 (80%) | 70 (78%) | 30 (70%) | 56 (57%) |
| White | 6 (55%) | 4 (44%) | 0 (0%) | 1 (10%) | 14 (16%) | 6 (14%) | 30 (31%) |
| Other | 3 (27%) | 5 (56%) | 2 (40%) | 1 (10%) | 6 (7%) | 7 (16%) | 10 (10%) |
| Ethnicity | |||||||
| Hispanic | 3 (27%) | 6 (67%) | 1 (20%) | 0 (0%) | 6 (7%) | 4 (9%) | 4 (4%) |
| Non-Hispanic | 5 (45%) | 2 (22%) | 4 (80%) | 9 (90%) | 84 (93%) | 39 (91%) | 93 (95%) |
| Unknown | 3 (27%) | 1 (11%) | 0 (0%) | 1 (10%) | 0 (0%) | 0 (0%) | 1 (1%) |
| Education | |||||||
| Less than high school | 1 (9%) | 3 (33%) | 3 (60%) | 4 (40%) | 6 (7%) | 5 (12%) | 11 (11%) |
| High school diploma or GED | 1 (9%) | 3 (33%) | 1 (20%) | 3 (30%) | 38 (42%) | 20 (47%) | 27 (28%) |
| Some college | 2 (18%) | 1 (11%) | 1 (20%) | 2 (20%) | 23 (26%) | 9 (21%) | 20 (20%) |
| Bachelor or undergraduate degree | 4 (36%) | 1 (11%) | 0 (0%) | 0 (0%) | 17 (19%) | 6 (14%) | 24 (24%) |
| Post-undergraduate degree | 2 (18%) | 1 (11%) | 0 (0%) | 0 (0%) | 4 (4%) | 3 (7%) | 14 (14%) |
| Unknown | 1 (9%) | 0 (0%) | 0 (0%) | 1 (10%) | 2 (2%) | 0 (0%) | 2 (2%) |
| Income | |||||||
| No income | 1 (9%) | 0 (0%) | 2 (40%) | 0 (0%) |
|
|
|
| Under $10,000 | 0 (0%) | 0 (0%) | 2 (40%) | 4 (40%) |
|
|
|
| $10,000 - $30,000 | 2 (18%) | 4 (44%) | 1 (20%) | 5 (50%) |
|
|
|
| $30,000 - $50,000 | 1 (9%) | 1 (11%) | 0 (0%) | 0 (0%) |
|
|
|
| $50,000 - $60,000 | 2 (18%) | 1 (11%) | 0 (0%) | 0 (0%) |
|
|
|
| $80,000 - $100,000 | 2 (18%) | 0 (0%) | 0 (0%) | 0 (0%) |
|
|
|
| $150,000 - $200,000 | 1 (9%) | 0 (0%) | 0 (0%) | 0 (0%) |
|
|
|
| $250,000 or more | 1 (9%) | 1 (11%) | 0 (0%) | 0 (0%) |
|
|
|
| Under $15,000 |
|
|
|
|
58 (64%) | 27 (63%) | 46 (47%) |
| $15,000 - $19,999 |
|
|
|
|
8 (9%) | 3 (7%) | 7 (7%) |
| $20,000 - $39,999 |
|
|
|
|
11 (12%) | 7 (16%) | 8 (8%) |
| $40,000 - $59,999 |
|
|
|
|
4 (4%) | 2 (5%) | 9 (9%) |
| $60,000 - $79,999 |
|
|
|
|
0 (0%) | 1 (2%) | 8 (8%) |
| $80,000 or more |
|
|
|
|
4 (4%) | 1 (2%) | 15 (15%) |
| Unknown | 1 (9%) | 2 (22%) | 0 (0%) | 1 (10%) | 5 (6%) | 2 (5%) | 5 (5%) |
| Delivery Mode | |||||||
| Vaginal | 5 (45%) | 1 (11%) | 5 (100%) | 8 (80%) | 71 (79%) | 38 (88%) | 76 (78%) |
| Cesarean | 5 (45%) | 8 (89%) | 0 (0%) | 2 (20%) | 16 (18%) | 3 (7%) | 14 (14%) |
| Unknown | 1 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | 3 (3%) | 2 (5%) | 8 (8%) |
#save_kable(demTable, file = file.path(figureOut, paste(Sys.Date(), "Table2_demTable.png", sep="_")), zoom=5)
Organize and set up tables from MetaPhlAn2 and Gardnerella mapping method. ## MetaPhlAn2 Species relative abundances for Stanford and UAB cohorts:
sumDF <- suMetaphlanPath %>%
read_tsv() %>%
filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
select(Species, everything(), -ID) %>%
dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{10}")) %>%
column_to_rownames(var="Species") %>%
t %>%
as.data.frame() %>%
mutate_all(funs(./100)) %>%
rownames_to_column(var="SampleID")
sumDF[1:6,1:5]
## SampleID Actinobaculum_massiliense Actinobaculum_schaalii
## 1 1000801248 0 0
## 2 1000801318 0 0
## 3 1000801368 0 0
## 4 1001301158 0 0
## 5 1001301248 0 0
## 6 1001301318 0 0
## Actinobaculum_urinale Actinomyces_europaeus
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Species relative abundances for MOMS-PI cohort:
hmp2mDF <- hmp2MetaphlanPath %>%
read_tsv %>%
filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
select(Species, everything(), -ID) %>%
as.data.frame() %>%
dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{7}")) %>%
column_to_rownames(var="Species") %>%
t %>%
as.data.frame() %>%
mutate_all(funs(./100)) %>%
rownames_to_column(var="SampleID") %>%
filter(SampleID %in% QCfilterSamples) # keep only samples that pass quality filtering
hmp2mDF[1:6,1:5]
## SampleID Methanobrevibacter_smithii Actinobaculum_massiliense
## 1 6743909 0 0
## 2 6743911 0 0
## 3 6743912 0 0
## 4 6743914 0 0
## 5 6743916 0 0
## 6 6743917 0 0
## Actinobaculum_schaalii Actinobaculum_unclassified
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Sanity check that relative abundances sum to 1
hmp2mDF %>%
column_to_rownames("SampleID") %>%
rowSums %>%
as.data.frame() %>%
summary
## .
## Min. :0.05339
## 1st Qu.:1.00000
## Median :1.00000
## Mean :0.99762
## 3rd Qu.:1.00000
## Max. :1.00000
hmp2mDF %>%
column_to_rownames("SampleID") %>%
rowSums %>%
as.data.frame() %>%
ggplot(aes(x=`.`)) +
geom_histogram()
Relative abundances of most samples sum to 1.
What is in samples that do not sum to 1?
hmp2MetaphlanPath %>%
read_tsv %>%
dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{7}")) %>%
select(ID, "6744527", "6744289") %>%
filter(`6744527`>0|`6744289`>0) %>%
formattable()
| ID | 6744527 | 6744289 |
|---|---|---|
| k__Bacteria | 100.00000 | 100.0000 |
| k__Bacteria|p__Actinobacteria | 94.66109 | 46.8811 |
| k__Bacteria|p__Actinobacteria|c__Actinobacteria | 94.66109 | 46.8811 |
| k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales | 94.66109 | 46.8811 |
| k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae | 94.66109 | 46.8811 |
| k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacteriaceae_unclassified | 94.66109 | 46.8811 |
| k__Bacteria|p__Firmicutes | 5.33891 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli | 5.33891 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales | 5.33891 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae | 5.33891 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus | 5.33891 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_gasseri | 1.04474 | 0.0000 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_gasseri|t__Lactobacillus_gasseri_unclassified | 1.04474 | 0.0000 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_helveticus | 4.29417 | 0.0000 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_helveticus|t__Lactobacillus_helveticus_unclassified | 4.29417 | 0.0000 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners | 0.00000 | 53.1189 |
| k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners|t__Lactobacillus_iners_unclassified | 0.00000 | 53.1189 |
Add relative abundance of individual clades and genomospecies in Stanford and UAB cohort samples, and check for differences in finding Gardnerella by MetaPhlAn2 and mapping method.
Stanford and UAB Clades
suGardCladeAb0 <- suGardCladePath %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID)) %>%
select(-propClade) %>%
filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade/genomospecies
group_by(SampleID) %>%
mutate(propClade=n/sum(n)) %>%
ungroup() %>%
full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
mutate(relativeAbundance=propClade*Gardnerella_vaginalis)
# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardCladeAb0 %>%
filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 1 × 6
## SampleID Clade n propClade Gardnerella_vaginalis relativeAbundance
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 4008435358 C4 2 1 0 0
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardCladeAb0 %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 0 × 6
## # … with 6 variables: SampleID <chr>, Clade <chr>, n <dbl>, propClade <dbl>,
## # Gardnerella_vaginalis <dbl>, relativeAbundance <dbl>
There is one sample where Gardnerella clades were found by mapping method and not by MetaPhlAn2, and no samples where Gardnerella was found by MetaPhlAn2 but not by mapping method
# make table of clade abundances
suGardCladeAb <- suGardCladeAb0 %>%
filter(relativeAbundance>0) %>%
select(SampleID, Clade, relativeAbundance)
Stanford and UAB Genomospecies
suGardGenomoAb0 <- suGardGenomoPath %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID)) %>%
select(-propGenomospecies) %>%
filter(n>=2) %>%
group_by(SampleID) %>%
mutate(propGenomospecies=n/sum(n)) %>%
ungroup() %>%
full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)
# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardGenomoAb0 %>%
filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 0 × 6
## # … with 6 variables: SampleID <chr>, Genomospecies <chr>, n <dbl>,
## # propGenomospecies <dbl>, Gardnerella_vaginalis <dbl>,
## # relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardGenomoAb0 %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 1 × 6
## SampleID Genomospecies n propGenomospecies Gardnerella_vaginalis
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 1061235278 <NA> NA NA 0.0152
## # … with 1 more variable: relativeAbundance <dbl>
No samples where Gardnerella genomospecies were found by mapping method but not MetaPhlAn2 and one sample where Gardnerella was found by MetaPhlAn2 but not by mapping method.
# make table of genomospecies abundances
suGardGenomoAb <- suGardGenomoAb0 %>%
filter(relativeAbundance>0) %>%
select(SampleID, Genomospecies, relativeAbundance)
MOMS-PI Clades
hmp2GardCladeAb0 <- hmp2GardCladePath %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID)) %>%
select(-propClade) %>%
filter(n>=2) %>% # 2+ reads need to map to a clade/genomospecies to be considered present
group_by(SampleID) %>%
mutate(propClade=n/sum(n)) %>%
ungroup() %>%
full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
filter(SampleID %in% QCfilterSamples)
# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardCladeAb0 %>%
filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 9 × 6
## SampleID Clade n propClade Gardnerella_vaginalis relativeAbundance
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6744255 C2 2 1 0 0
## 2 6744416 C4 2 1 0 0
## 3 6744438 C3 2 1 0 0
## 4 6744478 C4 6 1 0 0
## 5 6744556 C4 2 1 0 0
## 6 6744891 C3 2 1 0 0
## 7 6744963 C4 2 1 0 0
## 8 6745078 C4 2 1 0 0
## 9 6745186 C3 2 1 0 0
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardCladeAb0 %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 58 × 6
## SampleID Clade n propClade Gardnerella_vaginalis relativeAbundance
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6743917 <NA> NA NA 0.00304 NA
## 2 6743919 <NA> NA NA 0.0261 NA
## 3 6743922 <NA> NA NA 0.00221 NA
## 4 6743987 <NA> NA NA 0.000390 NA
## 5 6743992 <NA> NA NA 0.0289 NA
## 6 6744004 <NA> NA NA 0.000170 NA
## 7 6744017 <NA> NA NA 0.105 NA
## 8 6744034 <NA> NA NA 1 NA
## 9 6744039 <NA> NA NA 0.00244 NA
## 10 6744040 <NA> NA NA 0.000451 NA
## # … with 48 more rows
There are 58 samples where Gardnerella was found by MetaPhlan2 but no clades found by mapping method, and 9 samples where Gardnerella was found by mapping method but not by MetaPhlan2
# make abundance table
hmp2GardCladeAb <- hmp2GardCladeAb0 %>%
filter(relativeAbundance>0) %>%
select(SampleID, Clade, relativeAbundance)
MOMS-PI Genomospecies
hmp2GardGenomoAb0 <- hmp2GardGenomoPath %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID)) %>%
select(-propGenomospecies) %>%
filter(n>=2) %>% # 2+ reads mapped to each clade/genomospecies to be considered present
group_by(SampleID) %>%
mutate(propGenomospecies=n/sum(n)) %>%
ungroup() %>%
full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)
# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardGenomoAb0 %>%
filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 2 × 6
## SampleID Genomospecies n propGenomospecies Gardnerella_vaginalis
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 6744255 GS4 2 1 0
## 2 6744478 GS5 5 1 0
## # … with 1 more variable: relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardGenomoAb0 %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 127 × 6
## SampleID Genomospecies n propGenomospecies Gardnerella_vaginalis
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 6743917 <NA> NA NA 0.00304
## 2 6743919 <NA> NA NA 0.0261
## 3 6743922 <NA> NA NA 0.00221
## 4 6743926 <NA> NA NA 0.0128
## 5 6743955 <NA> NA NA 0.00157
## 6 6743987 <NA> NA NA 0.000390
## 7 6743992 <NA> NA NA 0.0289
## 8 6744004 <NA> NA NA 0.000170
## 9 6744017 <NA> NA NA 0.105
## 10 6744034 <NA> NA NA 1
## # … with 117 more rows, and 1 more variable: relativeAbundance <dbl>
127 samples where Gardnerella was found by MetaPhlan2 but no genomospecies were found by mapping method and 2 samples where Gardnerella was found by mapping method but not by MetaPhlan2
#make abundance table
hmp2GardGenomoAb <- hmp2GardGenomoAb0 %>%
filter(relativeAbundance>0) %>%
select(SampleID, Genomospecies, relativeAbundance)
Asses samples where Gardnerella was uncharacterized at the clade level. (Where Gardnerella was found by MetaPhlAn2 but no clades were identified using the mapping method).
unchGardSamples <- hmp2GardCladeAb0 %>%
filter(is.na(n), Gardnerella_vaginalis>0) %>%
.$SampleID
Comparison of number of microbial paired reads and proportion Gardnerella in samples with uncharacterized Gardnerella vs. samples where Gardnerella was found with both methods.
q <- hmp2mDF %>%
select(SampleID, Gardnerella_vaginalis) %>%
mutate(unchGard=case_when(SampleID %in% unchGardSamples~TRUE,
!(SampleID %in% unchGardSamples)~FALSE)) %>%
left_join(sgDF1[,c("SampleID", "bbmapFiltered_pairs", "bacLoad")], by="SampleID") %>%
ggplot(aes(x=Gardnerella_vaginalis, y=bbmapFiltered_pairs, color=unchGard, shape=unchGard)) +
geom_point(alpha=0.5) +
theme(legend.position = "bottom") +
binaryColors +
labs(x="Proportion Gardnerella", y="Microbial Reads", color="Uncharacterized Gardnerella", shape="Uncharacterized Gardnerella")
ggMarginal(q, groupFill = TRUE, type="density")
Samples with uncharacterized Gardnerella have fewer microbial reads and a lower proportion of Gardnerella
# Gardnerella abundance
gardCladeAb0 <- full_join(suGardCladeAb, hmp2GardCladeAb, by = c("SampleID", "Clade", "relativeAbundance"))
gardGenomoAb0 <- full_join(suGardGenomoAb, hmp2GardGenomoAb, by = c("SampleID", "Genomospecies", "relativeAbundance"))
# metaphlan
mDF0 <- sumDF %>%
full_join(hmp2mDF)
#number of speacies that appear in only one vs. both samples
testOverlap <- summarise_all(mDF0, anyNA) %>%
gather("Species", "anyNA", 2:ncol(.)) %>%
select(-SampleID)
nrow(testOverlap)
## [1] 364
table(testOverlap$anyNA)
##
## FALSE TRUE
## 209 155
364 total species and 209 appear in both studies (Stanford/UAB vs. MOMS-PI). Number of species in each study:
ncol(sumDF)-1
## [1] 225
ncol(hmp2mDF)-1
## [1] 348
Replace NAs with zeros for abundance values for species that did not appear in each respective study.
mDF1 <- mDF0 %>%
mutate_if(is.numeric, ~replace_na(.x, 0)) %>%
left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
select(SampleID, Cohort, everything())
anyNA(mDF1) # check for remaining NAs
## [1] FALSE
Proportion *Gardnerella* in samples by study:
mDF1 %>%
select(SampleID, Cohort, Gardnerella_vaginalis) %>%
mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
Cohort=="MOMS-PI"~"MOMS-PI"),
Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI"))) %>%
ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
geom_density(alpha=0.5) +
binaryColors +
labs(x="Proportion Gardnerella")
gardAbundance2Cohort <- mDF1 %>%
select(SampleID, Cohort, Gardnerella_vaginalis) %>%
left_join(sgDF1[,c("SampleID", "SubjectID")], by="SampleID") %>%
mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
Cohort=="MOMS-PI"~"MOMS-PI"),
Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI")))
gardAbundance2Cohort %>%
group_by(Cohort) %>%
summarize(min=min(Gardnerella_vaginalis),
`1stQu`=summary(Gardnerella_vaginalis)[2],
median=median(Gardnerella_vaginalis),
mean=mean(Gardnerella_vaginalis),
`3rdQu`=summary(Gardnerella_vaginalis)[5],
max=max(Gardnerella_vaginalis)) %>%
formattable()
| Cohort | min | 1stQu | median | mean | 3rdQu | max |
|---|---|---|---|---|---|---|
| Stanford & UAB Enriched | 0 | 0.1712211 | 0.5132085 | 0.4505522 | 0.6993391 | 0.9814731 |
| MOMS-PI | 0 | 0.0036418 | 0.0566380 | 0.2401044 | 0.4823945 | 1.0000000 |
Proportion Gardnerella by as subject averages by study:
subjectGardAbundance2Cohort <- gardAbundance2Cohort %>%
group_by(SubjectID, Cohort) %>%
summarize(Gardnerella_vaginalis=mean(Gardnerella_vaginalis)) %>%
ungroup
subjectAvgPropGard <- subjectGardAbundance2Cohort %>%
ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
geom_density(alpha=0.5) +
binaryColors +
theme(legend.position = "bottom",
legend.text = element_text(size=10),
legend.title = element_text(size=11)) +
labs(x="Subject Average Proportion Gardnerella")
subjectAvgPropGard
MGS sequenced samples from Stanford and UAB were selected based on enrichment of Gardnerella at the subject level. Create a cohort of a subset of samples in the MOMS-PI cohort that are also selected for enrichment of Gardnerella at the subject level for better comparison.
ntiles <- subjectGardAbundance2Cohort %>%
filter(Cohort=="Stanford & UAB Enriched") %>%
mutate(decile=ntile(Gardnerella_vaginalis, 3)) %>%
group_by(decile) %>%
filter(Gardnerella_vaginalis==min(Gardnerella_vaginalis)) %>%
.$Gardnerella_vaginalis %>%
sort %>%
unique
ntiles
## [1] 0.0000000 0.3569745 0.6732104
# subjectGardAbundance2Cohort0 <- subjectGardAbundance2Cohort %>%
# filter(Cohort=="MOMS-PI",
# SubjectID %in% hmp2SubjectsWDel) %>% # *** For selecting subjects that have gestational age at delivery ***
# mutate(Ntile=case_when(between(Gardnerella_vaginalis, ntiles[1], ntiles[2])~1,
# between(Gardnerella_vaginalis, ntiles[2], ntiles[3])~2,
# Gardnerella_vaginalis > ntiles[3]~3))
#
# subjN <- min(count(subjectGardAbundance2Cohort0, Ntile)$n) # number of subjects from each ntile to take
#
# HMP2highGardSubjects <- subjectGardAbundance2Cohort0 %>%
# group_by(Ntile) %>%
# sample_n(subjN, replace = FALSE) %>%
# .$SubjectID
#
# write_lines(HMP2highGardSubjects, "./HMP2highGardSubjects.txt")
HMP2highGardSubjects <- read_lines("./HMP2highGardSubjects.txt")
Figure S4: Distributions of subject average Gardnerella abundances
#fig.width=5, fig.height=2.5
subjectAvgPropGardEnr <- subjectGardAbundance2Cohort %>%
filter(Cohort=="Stanford & UAB Enriched"|(Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
geom_density(alpha=0.5) +
binaryColors +
theme(legend.position = "bottom",
legend.text = element_text(size=10),
legend.title = element_text(size=11)) +
labs(x="Subject Average Proportion Gardnerella")
plot_grid(subjectAvgPropGard, subjectAvgPropGardEnr, nrow=1, labels = c("A", "B"), label_size = 15)
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS4_enrichedSubjectPropGardbyStudy.png", sep="_")))
subjectGardAbundance2Cohort %>%
group_by(Cohort) %>%
summarize(min=min(Gardnerella_vaginalis),
`1stQu`=summary(Gardnerella_vaginalis)[2],
median=median(Gardnerella_vaginalis),
mean=mean(Gardnerella_vaginalis),
`3rdQu`=summary(Gardnerella_vaginalis)[5],
max=max(Gardnerella_vaginalis)) %>%
formattable()
| Cohort | min | 1stQu | median | mean | 3rdQu | max |
|---|---|---|---|---|---|---|
| Stanford & UAB Enriched | 0 | 0.25958767 | 0.4686271 | 0.4515941 | 0.6906473 | 0.9041753 |
| MOMS-PI | 0 | 0.01845903 | 0.1567142 | 0.2396632 | 0.3973811 | 0.9826293 |
subjectGardAbundance2Cohort %>%
filter((Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
group_by(Cohort) %>%
summarize(min=min(Gardnerella_vaginalis),
`1stQu`=summary(Gardnerella_vaginalis)[2],
median=median(Gardnerella_vaginalis),
mean=mean(Gardnerella_vaginalis),
`3rdQu`=summary(Gardnerella_vaginalis)[5],
max=max(Gardnerella_vaginalis)) %>%
formattable()
| Cohort | min | 1stQu | median | mean | 3rdQu | max |
|---|---|---|---|---|---|---|
| MOMS-PI Enriched | 0.00218635 | 0.2808695 | 0.4526737 | 0.4684783 | 0.722244 | 0.8500606 |
Distribution of Gardnerella abundnace in samples
HMP2HighGardSamples <- sgDF1 %>%
filter(SubjectID %in% HMP2highGardSubjects) %>%
.$SampleID
gardAbundance2Cohort %>%
filter(Cohort=="Stanford & UAB Enriched"|Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects) %>%
mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
geom_density(alpha=0.5) +
binaryColors +
labs(x="Proportion of Gardnerella")
Add enriched MOMS-PI cohort to tables Sample and subject metadata:
# extract enriched Gardnerella subset and full join to metadata
highGardSgDF <- sgDF1 %>%
filter(SubjectID %in% HMP2highGardSubjects) %>%
mutate(Cohort="MOMS-PI Enriched",
SampleID=paste0(SampleID, "_E"),
SubjectID=paste0(SubjectID, "_E"))
nrow(highGardSgDF)
## [1] 145
sgDF <- sgDF1 %>%
mutate(Cohort=as.vector(Cohort),
SubjectID=as.character(SubjectID)) %>%
full_join(highGardSgDF, by=colnames(sgDF1)) %>%
mutate(Cohort=factor(Cohort, levels = cohorts))
bacLoad <- sgDF %>%
select(SampleID, bacLoad)
sgDF %>%
group_by(Cohort) %>%
summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
formattable()
| Cohort | N Subjects | N Samples |
|---|---|---|
| Stanford Enriched | 20 | 62 |
| UAB Enriched | 15 | 45 |
| MOMS-PI Enriched | 42 | 145 |
| MOMS-PI | 231 | 781 |
MetaPhlAn2 species abundance tables:
highGardmDF <- mDF1 %>%
filter(SampleID %in% HMP2HighGardSamples) %>%
mutate(Cohort="MOMS-PI Enriched",
SampleID=paste0(SampleID, "_E"))
mDF <- mDF1 %>%
mutate(Cohort=as.vector(Cohort)) %>%
rbind(highGardmDF) %>%
mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")))
Gardnerella clade and genomospecies abundance tables:
# Clade Abundances
gardCladeAb1 <- gardCladeAb0 %>%
left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
mutate(Cohort=as.vector(Cohort))
highGardCladeAb <- gardCladeAb1 %>%
filter(SampleID %in% HMP2HighGardSamples) %>%
mutate(Cohort="MOMS-PI Enriched",
SampleID=paste0(SampleID, "_E"))
gardCladeAb <- gardCladeAb1 %>%
rbind(highGardCladeAb) %>%
mutate(Cohort=factor(Cohort, levels = cohorts))
# Gemomospecies Abundances
gardGenomoAb1 <- gardGenomoAb0 %>%
left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
mutate(Cohort=as.vector(Cohort))
highGardGenomoAb <- gardGenomoAb1 %>%
filter(SampleID %in% HMP2HighGardSamples) %>%
mutate(Cohort="MOMS-PI Enriched",
SampleID=paste0(SampleID, "_E"))
gardGenomoAb <- gardGenomoAb1 %>%
rbind(highGardGenomoAb) %>%
mutate(Cohort=factor(Cohort, levels = cohorts))
# note these tables contain samples with at least one identified clade or genomospecies of gardnerella
avgTop20Abundance0 <- mDF %>%
select(SampleID, Cohort, everything()) %>%
gather("Species", "Abundance", 3:ncol(.)) %>%
group_by(Cohort, Species) %>%
summarise(avgAbundance=mean(Abundance))
top20taxa <- avgTop20Abundance0 %>%
group_by(Cohort) %>%
top_n(20, avgAbundance) %>%
.$Species %>%
unique
avgTop20Abundance0 %>%
spread(Cohort, avgAbundance) %>%
mutate(stanfordRank=rank(-`Stanford Enriched`),
uabRank=rank(-`UAB Enriched`),
momspiRank=rank(-`MOMS-PI`),
momspiERank=rank(-`MOMS-PI Enriched`),
Stanford=round(`Stanford Enriched`, 2),
UAB=round(`UAB Enriched`,2),
`MOMS-PI`=round(`MOMS-PI`, 2),
`MOMS-PI Enriched`=round(`MOMS-PI Enriched`, 2)) %>%
filter(Species %in% top20taxa) %>%
select(Species, `Stanford Enriched`, stanfordRank, `UAB Enriched`, uabRank, `MOMS-PI Enriched`, momspiERank, `MOMS-PI`, momspiRank) %>%
arrange(stanfordRank) %>%
formattable()
| Species | Stanford Enriched | stanfordRank | UAB Enriched | uabRank | MOMS-PI Enriched | momspiERank | MOMS-PI | momspiRank |
|---|---|---|---|---|---|---|---|---|
| Gardnerella_vaginalis | 3.701475e-01 | 1 | 5.613321e-01 | 1.0 | 0.46 | 1 | 0.24 | 2 |
| Lactobacillus_iners | 1.657524e-01 | 2 | 1.991789e-01 | 2.0 | 0.24 | 2 | 0.33 | 1 |
| Lactobacillus_crispatus | 1.387812e-01 | 3 | 3.084282e-02 | 4.0 | 0.06 | 3 | 0.17 | 3 |
| Lactobacillus_gasseri | 9.288219e-02 | 4 | 1.278889e-05 | 111.0 | 0.01 | 10 | 0.04 | 5 |
| Atopobium_vaginae | 5.537944e-02 | 5 | 7.964385e-02 | 3.0 | 0.06 | 4 | 0.03 | 6 |
| Lactobacillus_jensenii | 5.374158e-02 | 6 | 5.861467e-04 | 34.0 | 0.03 | 5 | 0.05 | 4 |
| Prevotella_timonensis | 1.395939e-02 | 7 | 4.420527e-03 | 12.0 | 0.02 | 9 | 0.01 | 7 |
| Ureaplasma_parvum | 8.847260e-03 | 8 | 1.727678e-03 | 20.0 | 0.00 | 15 | 0.01 | 13 |
| Prevotella_bivia | 7.590635e-03 | 9 | 1.827222e-02 | 5.0 | 0.01 | 12 | 0.01 | 14 |
| Finegoldia_magna | 7.540677e-03 | 10 | 1.186363e-02 | 7.0 | 0.00 | 33 | 0.00 | 19 |
| Megasphaera_genomosp_type_1 | 7.273266e-03 | 11 | 8.389764e-03 | 9.0 | 0.02 | 7 | 0.01 | 9 |
| Prevotella_amnii | 7.232839e-03 | 12 | 1.149809e-02 | 8.0 | 0.02 | 8 | 0.01 | 12 |
| Mycoplasma_hominis | 6.399960e-03 | 13 | 1.373242e-02 | 6.0 | 0.01 | 11 | 0.01 | 10 |
| Megasphaera_unclassified | 6.204035e-03 | 14 | 5.562682e-03 | 11.0 | 0.01 | 13 | 0.00 | 17 |
| Ureaplasma_urealyticum | 5.318142e-03 | 15 | 1.671022e-03 | 21.0 | 0.00 | 23 | 0.00 | 27 |
| Varibaculum_cambriense | 3.935813e-03 | 16 | 6.948800e-04 | 33.0 | 0.00 | 67 | 0.00 | 71 |
| Dialister_micraerophilus | 3.856794e-03 | 17 | 2.908351e-03 | 14.0 | 0.01 | 14 | 0.00 | 15 |
| Veillonella_unclassified | 3.408242e-03 | 18 | 7.272222e-05 | 78.0 | 0.00 | 120 | 0.00 | 72 |
| Peptoniphilus_harei | 3.114544e-03 | 19 | 7.010158e-03 | 10.0 | 0.00 | 37 | 0.00 | 23 |
| Oligella_urethralis | 2.957774e-03 | 20 | 3.871511e-04 | 45.0 | 0.00 | 113 | 0.00 | 41 |
| Prevotella_disiens | 2.237563e-03 | 21 | 3.915562e-03 | 13.0 | 0.00 | 39 | 0.00 | 20 |
| Streptococcus_anginosus | 1.453100e-03 | 26 | 2.524624e-03 | 15.0 | 0.00 | 31 | 0.00 | 32 |
| Bifidobacterium_breve | 1.129426e-03 | 27 | 1.512753e-03 | 22.0 | 0.00 | 18 | 0.00 | 30 |
| Peptoniphilus_lacrimalis | 1.115758e-03 | 29 | 2.299044e-03 | 18.0 | 0.00 | 42 | 0.00 | 49 |
| Corynebacterium_jeikeium | 5.682258e-04 | 39 | 1.828533e-03 | 19.0 | 0.00 | 115 | 0.00 | 171 |
| Streptococcus_mitis_oralis_pneumoniae | 5.078548e-04 | 42 | 1.508578e-04 | 72.0 | 0.00 | 19 | 0.00 | 24 |
| Lactobacillus_phage_Lv_1 | 4.896242e-04 | 44 | 0.000000e+00 | 255.5 | 0.00 | 16 | 0.01 | 8 |
| Corynebacterium_sp_HFH0082 | 3.812645e-04 | 50 | 2.431147e-03 | 16.0 | 0.00 | 256 | 0.00 | 39 |
| Clostridiales_genomosp_BVAB3 | 3.396290e-05 | 103 | 8.818867e-04 | 25.0 | 0.00 | 20 | 0.00 | 43 |
| Bacteroides_fragilis | 8.709677e-07 | 167 | 0.000000e+00 | 255.5 | 0.00 | 17 | 0.00 | 59 |
| Lactobacillus_delbrueckii | 4.419355e-07 | 173 | 0.000000e+00 | 255.5 | 0.00 | 256 | 0.00 | 16 |
| Prevotella_melaninogenica | 6.935484e-08 | 184 | 2.406838e-03 | 17.0 | 0.00 | 110 | 0.00 | 69 |
| Bradyrhizobium_sp_DFCI_1 | 0.000000e+00 | 275 | 0.000000e+00 | 255.5 | 0.02 | 6 | 0.01 | 11 |
| Lactobacillus_helveticus | 0.000000e+00 | 275 | 0.000000e+00 | 255.5 | 0.00 | 256 | 0.00 | 18 |
x <- mDF %>%
select(SampleID, Cohort, top20taxa) %>%
mutate_if(is.numeric, ~case_when(.x>0~1,
.x==0~0))%>%
group_by(Cohort) %>%
summarize_if(is.numeric, ~sum(.x)/n()) %>%
data.frame()
rownames(x) <- x$Cohort
x %>%
select(-Cohort) %>%
t %>%
as.data.frame() %>%
rownames_to_column("Species") %>%
arrange(-`Stanford Enriched`) %>%
formattable()
| Species | Stanford Enriched | UAB Enriched | MOMS-PI Enriched | MOMS-PI |
|---|---|---|---|---|
| Gardnerella_vaginalis | 0.85483871 | 0.95555556 | 0.958620690 | 0.852752881 |
| Atopobium_vaginae | 0.62903226 | 0.84444444 | 0.731034483 | 0.494238156 |
| Finegoldia_magna | 0.58064516 | 0.60000000 | 0.131034483 | 0.186939821 |
| Dialister_micraerophilus | 0.54838710 | 0.80000000 | 0.655172414 | 0.400768246 |
| Prevotella_timonensis | 0.53225806 | 0.71111111 | 0.468965517 | 0.368758003 |
| Lactobacillus_iners | 0.45161290 | 0.91111111 | 0.924137931 | 0.834827145 |
| Prevotella_bivia | 0.41935484 | 0.55555556 | 0.289655172 | 0.239436620 |
| Lactobacillus_gasseri | 0.38709677 | 0.08888889 | 0.082758621 | 0.161331626 |
| Peptoniphilus_harei | 0.38709677 | 0.66666667 | 0.172413793 | 0.167733675 |
| Ureaplasma_parvum | 0.35483871 | 0.53333333 | 0.455172414 | 0.430217670 |
| Megasphaera_genomosp_type_1 | 0.33870968 | 0.80000000 | 0.620689655 | 0.395646607 |
| Prevotella_disiens | 0.33870968 | 0.60000000 | 0.117241379 | 0.157490397 |
| Lactobacillus_crispatus | 0.27419355 | 0.08888889 | 0.296551724 | 0.362355954 |
| Megasphaera_unclassified | 0.27419355 | 0.57777778 | 0.393103448 | 0.218950064 |
| Prevotella_amnii | 0.25806452 | 0.46666667 | 0.462068966 | 0.271446863 |
| Peptoniphilus_lacrimalis | 0.25806452 | 0.46666667 | 0.137931034 | 0.134443022 |
| Streptococcus_anginosus | 0.25806452 | 0.40000000 | 0.089655172 | 0.084507042 |
| Lactobacillus_jensenii | 0.24193548 | 0.15555556 | 0.193103448 | 0.307298335 |
| Ureaplasma_urealyticum | 0.20967742 | 0.28888889 | 0.089655172 | 0.056338028 |
| Bifidobacterium_breve | 0.19354839 | 0.15555556 | 0.068965517 | 0.042253521 |
| Varibaculum_cambriense | 0.17741935 | 0.28888889 | 0.027586207 | 0.070422535 |
| Mycoplasma_hominis | 0.16129032 | 0.68888889 | 0.344827586 | 0.243277849 |
| Corynebacterium_jeikeium | 0.12903226 | 0.26666667 | 0.020689655 | 0.026888604 |
| Oligella_urethralis | 0.06451613 | 0.06666667 | 0.013793103 | 0.021766965 |
| Streptococcus_mitis_oralis_pneumoniae | 0.06451613 | 0.04444444 | 0.041379310 | 0.040973111 |
| Veillonella_unclassified | 0.04838710 | 0.08888889 | 0.013793103 | 0.024327785 |
| Clostridiales_genomosp_BVAB3 | 0.04838710 | 0.35555556 | 0.206896552 | 0.107554417 |
| Corynebacterium_sp_HFH0082 | 0.03225806 | 0.08888889 | 0.000000000 | 0.003841229 |
| Bacteroides_fragilis | 0.03225806 | 0.00000000 | 0.006896552 | 0.003841229 |
| Prevotella_melaninogenica | 0.01612903 | 0.17777778 | 0.013793103 | 0.025608195 |
| Lactobacillus_phage_Lv_1 | 0.01612903 | 0.00000000 | 0.020689655 | 0.026888604 |
| Lactobacillus_delbrueckii | 0.01612903 | 0.00000000 | 0.000000000 | 0.010243278 |
| Bradyrhizobium_sp_DFCI_1 | 0.00000000 | 0.00000000 | 0.103448276 | 0.062740077 |
| Lactobacillus_helveticus | 0.00000000 | 0.00000000 | 0.000000000 | 0.011523688 |
Create abundance table with taxa of interest
# abundance table of taxa of interest with clades
mDFtoiClades <- gardCladeAb %>%
spread(Clade, relativeAbundance) %>%
full_join(mDF, by=c("SampleID", "Cohort")) %>%
select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>% # keep total Gardnerella abundance in table for now. Many operations are taxon vs. taxon. Will need to remove this column when analyses call for it.
mutate_at(vars(c(Gardnerella_vaginalis, clades, lactos, anaerobes)), replace_na, 0) %>%
mutate(Cohort=factor(Cohort, levels=cohorts))
gardCladeAbU <- gardCladeAb %>%
full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID","Cohort")) %>%
spread(Clade, relativeAbundance) %>%
select(-`<NA>`) %>% # remove "NA column" created by samples that have zero Gardnerella
mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(C1)&is.na(C2)&is.na(C3)&is.na(C4)&is.na(C5)&is.na(C6)~Gardnerella_vaginalis,
Gardnerella_vaginalis>0&!is.na(clades)~0)) %>%
gather("var", "relativeAbundance", c(clades, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
replace_na(list(relativeAbundance=0))
Average clades per sample in each cohorts:
gardCladeAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
with_groups(Cohort, summarize, mean=mean(nClades), `standard deviation`=sd(nClades)) %>%
formattable
| Cohort | mean | standard deviation |
|---|---|---|
| Stanford Enriched | 3.016129 | 1.894703 |
| UAB Enriched | 4.777778 | 1.593864 |
| MOMS-PI Enriched | 3.772414 | 1.892033 |
| MOMS-PI | 2.655570 | 2.088537 |
Average clades per sample across all cohorts:
gardCladeAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
summarize(mean=mean(nClades), `standard deviation`=sd(nClades)) %>%
formattable
| mean | standard deviation |
|---|---|
| 2.926428 | 2.103064 |
cladesPerSample <- gardCladeAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
count(Cohort, nClades) %>%
group_by(Cohort) %>%
mutate(n=n/sum(n)) %>%
ggplot(aes(x=nClades, y=n)) +
geom_bar(stat="identity", fill = "black") +
facet_wrap(~Cohort) +
labs(x="Clades per Sample", y="Proportion of Samples", fill=bquote('Uncharacterized'~italic(Gardnerella))) +
theme(axis.text.x = element_text(size=9),
strip.text = element_text(size=13),
aspect.ratio = 0.5)
gardGenomoAbU <- gardGenomoAb %>%
spread(Genomospecies, relativeAbundance) %>%
full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID", "Cohort")) %>%
mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(GS1)&is.na(GS2)&is.na(GS3)&is.na(GS4)&is.na(GS5)&is.na(GS6)&is.na(GS7)&is.na(GS8)&is.na(GS9)&is.na(GS10)&is.na(GS11)&is.na(GS12)&is.na(GS13)&is.na(GS14)~Gardnerella_vaginalis,
Gardnerella_vaginalis>0&!is.na(genomos)~0)) %>%
gather("var", "relativeAbundance", c(genomos, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
replace_na(list(relativeAbundance=0))
Average clades per sample in each cohorts:
gardGenomoAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(genomos, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
with_groups(Cohort, summarize, mean=mean(nGenomos), `standard deviation`=sd(nGenomos)) %>%
formattable
| Cohort | mean | standard deviation |
|---|---|---|
| Stanford Enriched | 4.016129 | 3.226387 |
| UAB Enriched | 9.177778 | 4.339506 |
| MOMS-PI Enriched | 5.379310 | 3.811658 |
| MOMS-PI | 3.382843 | 3.750646 |
Number of samples with all 14 genomospecies
gardGenomoAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(genomos, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
count(Cohort, nGenomos) %>%
group_by(Cohort) %>%
mutate(pcnt=n/sum(n)*100) %>% # convert to fraction of sample
filter(nGenomos==14) %>%
formattable
| Cohort | nGenomos | n | pcnt |
|---|---|---|---|
| Stanford Enriched | 14 | 1 | 1.6129032 |
| UAB Enriched | 14 | 12 | 26.6666667 |
| MOMS-PI Enriched | 14 | 1 | 0.6896552 |
| MOMS-PI | 14 | 6 | 0.7682458 |
Average clades per sample across all cohorts:
gardGenomoAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(genomos, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
summarize(mean=mean(nGenomos), `standard deviation`=sd(nGenomos)) %>%
formattable
| mean | standard deviation |
|---|---|
| 3.953533 | 3.974942 |
genomosPerSample <- gardGenomoAbU %>%
spread(var, relativeAbundance) %>%
mutate_at(genomos, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
count(Cohort, nGenomos) %>%
group_by(Cohort) %>%
mutate(n=n/sum(n), # convert to fraction of sample
nGenomos=factor(nGenomos, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"))) %>%
ggplot(aes(x=nGenomos, y=n)) +
geom_bar(stat="identity", fill = "black") +
facet_wrap(~Cohort) +
labs(x="Genomospecies per Sample", y="Proportion of Samples") +
theme(axis.text.x = element_text(size=9),
strip.text = element_text(size=13),
aspect.ratio = 0.5)
Calculate clade prevalences
prevTable <- gardCladeAbU %>%
with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>%
mutate(prevalence=round(p/n, 2),
var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels=c(clades, "Unch.", "Total")))
Sample relative abundances with prevalence
#fig.width=7, fig.height=3.5
gardCladeAbU %>%
filter(var!="Uncharacterized_Gardnerella") %>%
mutate(var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels=c(clades, "Total"))) %>%
left_join(prevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
mutate(Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
cladeColorsTotal +
facet_grid(Cohort~.) +
theme(legend.position = "none",
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size=23),
strip.text = element_text(size=17)) +
labs(x=NULL, y="Relative Abundance")
Clades differ in abundance and abundance of clades varies across cohorts. Clade 3 is less abundant in Stanford cohort but abundant in other cohorts, especially UAB. Clade 1 is more abundant in Stanford. Clade 6 is rare in Stanford but appears in UAB and MOMS-PI cohorts.
Prevalence vs relative abundance of Gardnerella clades
cladeAbvPrev <- prevTable %>%
filter(var!="Unch.") %>%
ggplot(aes(x=prevalence, y=m, color=var)) +
geom_point(size=3, alpha=0.75) +
scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
facet_wrap(~Cohort) +
cladeColorsTotal +
coord_fixed() +
theme(legend.position = "right",
strip.text = element_text(size=13)) +
labs(x="Prevalence", y="Mean Relative Abundance", color="Clade")
Calculate genomospecies prevalences
genomosPrevTable <- gardGenomoAbU %>%
with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>%
mutate(prevalence=round(p/n, 2))
Sample relative abundance with prevalence
#fig.height=3, fig.width=7
gardGenomoAbU %>%
left_join(genomosPrevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
filter(var!="Uncharacterized_Gardnerella") %>%
mutate(var=factor(var, levels=c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")),
Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs)) %>%
ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
facet_grid(Cohort~.) +
genomoColorsTotal +
theme(axis.text.x = element_text(size=11),
axis.text.y = element_text(size=12),
axis.title.y = element_text(size=20),
strip.text = element_text(size = 11),
legend.position="none") +
labs(x=NULL, y="Relative Abundance")
Prevalence vs relative abundance of Gardnerella genomospecies
genomoAbvPrev <- gardGenomoAbU %>%
group_by(Cohort, var) %>%
summarise(avgAbundance=mean(relativeAbundance)) %>%
left_join(genomosPrevTable, by=c("Cohort", "var")) %>%
filter(var!="Uncharacterized_Gardnerella") %>%
dplyr::rename(Genomospecies=var) %>%
mutate(Genomospecies=factor(Genomospecies, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"))) %>%
ggplot(aes(x=prevalence, y=avgAbundance, color=Genomospecies)) +
geom_point(size=3, alpha=0.75) +
scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
genomoColorsTotal +
facet_wrap(~Cohort) +
coord_fixed() +
theme(legend.position = "right",
strip.text = element_text(size=13)) +
labs(x="Prevalence", y="Mean Relative Abundance", color="Species")
#fig.width = 6, fig.height=4
varsPerSample <- plot_grid(cladesPerSample, genomosPerSample, nrow = 1, labels = c("A", "B"), label_size = 15)
gardVarAbun <- plot_grid(cladeAbvPrev, genomoAbvPrev, nrow = 1, labels = c("C", "D"), label_size = 15)
plot_grid(varsPerSample, gardVarAbun, nrow = 2, axis = "l")
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure3_cladeGenomoPrevAbun.png", sep="_")))
Compare microbial load across the four cohorts:
# table of descriptive statistics with outliers
sgDF %>%
group_by(Cohort) %>%
summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
mutate_if(is.numeric, ~round(.x, 4)) %>%
formattable()
| Cohort | min | mean | median | max |
|---|---|---|---|---|
| Stanford Enriched | 0.0035 | 2.5594 | 0.0504 | 150.0600 |
| UAB Enriched | 0.0036 | 0.4658 | 0.2504 | 6.0354 |
| MOMS-PI Enriched | 0.0111 | 0.2492 | 0.1259 | 4.7932 |
| MOMS-PI | 0.0038 | 0.2074 | 0.0446 | 47.6365 |
# microbial load of outliers
sgDF %>%
filter(bacLoad>2) %>%
select(SampleID, Cohort, bacLoad) %>%
formattable
| SampleID | Cohort | bacLoad |
|---|---|---|
| 4009835178 | UAB Enriched | 6.035434 |
| 1005601348 | Stanford Enriched | 150.059967 |
| 6744677 | MOMS-PI | 4.103960 |
| 6743991 | MOMS-PI | 12.019563 |
| 6744354 | MOMS-PI | 47.636472 |
| 6744871 | MOMS-PI | 4.793229 |
| 6744459 | MOMS-PI | 4.739440 |
| 6744677_E | MOMS-PI Enriched | 4.103960 |
| 6744871_E | MOMS-PI Enriched | 4.793229 |
# density plot without outliers (bacload ≥ 2)
sgDF %>%
filter(bacLoad<2) %>%
mutate(Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
ggplot(aes(x=bacLoad, fill=Cohort, color=Cohort)) +
geom_density(alpha=0.5) +
quadColors2 +
theme(legend.position = c(0.85,0.8)) +
labs(x="Microbial Load")
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS5_bacLoadDensNoOut.png", sep="_")))
# table of descriptive statistics without outliers
sgDF %>%
filter(bacLoad<2) %>%
group_by(Cohort) %>%
summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
mutate_if(is.numeric, ~round(.x, 4)) %>%
formattable()
| Cohort | min | mean | median | max |
|---|---|---|---|---|
| Stanford Enriched | 0.0035 | 0.1414 | 0.0498 | 1.0077 |
| UAB Enriched | 0.0036 | 0.3392 | 0.2417 | 1.2209 |
| MOMS-PI Enriched | 0.0111 | 0.1905 | 0.1201 | 1.4112 |
| MOMS-PI | 0.0038 | 0.1143 | 0.0441 | 1.8656 |
All samples absolute abundance
#fig.width=7, fig.height=3.5
gardCladeAbU %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad,
var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
cladeColorsTotalUnch +
scale_y_continuous(limits=c(0,1), n.breaks = 5) +
facet_grid(Cohort~.) +
theme(legend.position = "none",
axis.text.x = element_text(size=16),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size=23),
strip.text = element_text(size=17)) +
labs(x=NULL, y="Absolute Abundance")
Clades 3 and 4 appear to show the greatest absolute abundance.
All samples absolute abundance
#fig.height=3, fig.width=7
gardGenomoAbU %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad,
var=factor(var, levels=c(genomosCladeOrder, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Unch.", "Total")),
Cohort=factor(Cohort, levels = cohorts, labels = cohortAbbrevs)) %>%
ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
facet_grid(Cohort~.) +
genomoColorsTotalUnch +
theme(axis.text.x = element_text(size=11),
axis.text.y = element_text(size=12),
axis.title.y = element_text(size=20),
strip.text = element_text(size = 11),
legend.position="none") +
labs(x=NULL, y="Absolute Abundance")
Genomospecies G. swidsinskii and genomospecies 7 appear to show the greatest absolute abundance.
Determine what threshold will be used to count a taxon as present vs. absent for co-occurrence and presence absence vs. microbial load analyses
MetaPhlAn2 will report all taxa that have 10% of marker genes non-zero based on this Google group response from Nicola Segata. Average number of marker genes per bacterial species is 184 ± 45. (Truong et al., 2015, Nature Mathods “MetaPhlAn2 for enhanced metagenomic taxonomic profiling”). About 1 million markers from >7,500 species. Minimum number of reads to be detected would be 18.4.
sgDF1 %>%
mutate(ntile=ntile(bbmapFiltered_pairs, n=20)) %>%
filter(ntile==5) %>%
filter(bbmapFiltered_pairs==min(bbmapFiltered_pairs)) %>%
.$bbmapFiltered_pairs
## [1] 104260
sgDF1 %>%
mutate(over30K=bbmapFiltered_pairs>100000) %>%
count(over30K) %>%
mutate(freq=n/sum(n))
## # A tibble: 2 × 3
## over30K n freq
## <lgl> <int> <dbl>
## 1 FALSE 165 0.186
## 2 TRUE 723 0.814
80% of samples have greater than 100,000 read pairs. MetaPhlAn2 uses about 10% of the reads in the library, and 18.4/10,000 yields a functional limit of detection of roughly 0.1%.
18.4/10000
## [1] 0.00184
A threshold of 0.1% has been used frequently in presence-absence analyses in microbiome data and will be used as a threshold here.
loadVClades <- gardCladeAbU %>%
spread(var, relativeAbundance) %>%
select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
gather("var", "abundance", clades) %>%
group_by(SampleID, Cohort) %>%
summarize(nClades=sum(abundance)) %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
ggplot(aes(x=nClades, y=bacLoad)) +
geom_jitter(alpha=0.25) +
geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") +
facet_wrap(~Cohort) +
scale_x_continuous(breaks = seq(0,6,1)) +
labs(x="Clades per Sample", y="Microbial Load")
loadVClades
Is this a function of sequencing depth?
gardCladeAbU %>%
spread(var, relativeAbundance) %>%
select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
gather("var", "abundance", clades) %>%
group_by(SampleID, Cohort) %>%
summarize(nClades=sum(abundance)) %>%
left_join(sgDF[,c("SampleID", "bacLoad", "Sickle_pairs")], by="SampleID") %>%
filter(bacLoad<2) %>%
ggplot(aes(x=nClades, y=Sickle_pairs)) +
geom_jitter(alpha=0.5) +
geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 6e07, label.x = 0, label.sep = "\n") +
facet_wrap(~Cohort) +
scale_x_continuous(breaks = seq(0,6,1)) +
labs(x="Clades per Sample", y="Library Size") # library size is QC-filtered reads
## `summarise()` has grouped output by 'SampleID'. You can override using the `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'
Microbial Reads
gardCladeAbU %>%
spread(var, relativeAbundance) %>%
select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
gather("var", "abundance", clades) %>%
group_by(SampleID, Cohort) %>%
summarize(nClades=sum(abundance)) %>%
left_join(sgDF[,c("SampleID", "bacLoad", "bbmapFiltered_pairs")], by="SampleID") %>%
filter(bacLoad<2) %>%
ggplot(aes(x=nClades, y=bbmapFiltered_pairs)) +
geom_jitter(alpha=0.5) +
geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 2e07, label.x = 0, label.sep = "\n") +
facet_wrap(~Cohort) +
scale_x_continuous(breaks = seq(0,6,1)) +
labs(x="Clades per Sample", y="Microbial Reads")
allUnchGardSamples <- gardCladeAbU %>%
filter(var=="Uncharacterized_Gardnerella",
relativeAbundance>0) %>%
.$SampleID
#write_lines(allUnchGardSamples, "/Users/hlberman/Desktop/unchGardSamples.txt")
sgDF %>%
mutate(isUnch=case_when(SampleID %in% allUnchGardSamples~TRUE,
SampleID %!in% allUnchGardSamples~FALSE)) %>%
ggplot(aes(x=isUnch, y=bbmapFiltered_pairs, color=Cohort, shape=Cohort)) +
geom_boxplot(alpha=0, position = position_dodge(width=1)) +
geom_quasirandom(alpha=0.5, dodge.width = 1) +
geom_hline(yintercept = 1e05, linetype=2, color="black") +
geom_hline(yintercept = 16334, linetype=2, color="gray") +
scale_y_log10() +
labs(x="Uncharacterized Gardnerella", y="Microbial Reads")
minReads <- min(sgDF$bbmapFiltered_pairs)
minReads
## [1] 16334
Rarefy to 100,000 reads per sample
Assess read counts and get samples with 100,000 reads
rarefiedReadCountsDF <- "./rarefied_gard_counts/rarefiedReadCounts.csv" %>%
read_csv
k100Samples0 <- rarefiedReadCountsDF %>%
filter(rarefiedReads==100000) %>%
.$SampleID
# add names for samples in MOMS-PI Enriched
k100Samples <- k100Samples0[k100Samples0 %in% HMP2HighGardSamples] %>%
paste0(., "_E") %>%
c(k100Samples0, .)
Import MetaPhlAn tables
rarefiedMetaphlanOuts <- "./rarefied_gard_counts/rarefiedMergedMetaphlanAbundance.tsv" %>%
read_tsv %>%
filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
select(Species, everything(), -ID) %>%
#as.data.frame() %>%
dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]+")) %>%
column_to_rownames(var="Species") %>%
t %>%
as.data.frame() %>%
mutate_all(funs(./100)) %>%
rownames_to_column(var="SampleID")
rarefiedGardProportions <- rarefiedMetaphlanOuts %>%
select(SampleID, Gardnerella_vaginalis) %>%
replace_na(list(Gardnerella_vaginalis=0))
Proportion of clades
rarefiedCladeRelativeAbundance <- "./rarefied_gard_counts/gardCladeRelativeAbundance.tsv" %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID)) %>%
select(-propClade) %>%
filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade
group_by(SampleID) %>%
mutate(propClade=n/sum(n)) %>%
ungroup() %>%
select(-n)
rarefiedGardDF0 <- rarefiedCladeRelativeAbundance %>%
full_join(rarefiedGardProportions) %>%
mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
select(-propClade) %>%
spread(Clade, relativeAbundance) %>%
select(-`<NA>`) %>%
replace_na(list(C1=0, C2=0, C3=0, C4=0, C5=0, C6=0)) %>%
right_join(sgDF[,c("SampleID", "SubjectID", "Cohort", "TermPre")]) %>%
mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6==0)~Gardnerella_vaginalis,
Gardnerella_vaginalis==0~0,
Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6>0)~0))
# create rows for MOMS-PI Enriched Samples
rarefiedGardDF <- rarefiedGardDF0 %>%
filter(SubjectID %in% HMP2highGardSubjects) %>%
mutate(Cohort="MOMS-PI Enriched",
SampleID=paste0(SampleID, "_E"),
SubjectID=paste0(SubjectID, "_E")) %>%
full_join(rarefiedGardDF0)
loadVrarefiedClades <- rarefiedGardDF %>%
filter(SampleID %in% k100Samples) %>%
select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
mutate_at(clades, ~case_when(.x>0~1,
.x==0~0)) %>%
mutate(nClades=(C1+C2+C3+C4+C5+C6),
Cohort=factor(Cohort, cohorts)) %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
ggplot(aes(x=nClades, y=bacLoad)) +
geom_jitter(alpha=0.25) +
geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") +
facet_wrap(~Cohort) +
scale_x_continuous(breaks = seq(0,6,1)) +
labs(x="Clades per Sample", y="Microbial Load")
loadVrarefiedClades
#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS6_rarefiedCladesPerSampleVsBacLoad.png", sep = "_")))
bacloadVsTaxWilcox <- mDFtoiClades %>%
gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
select(SampleID, Cohort, Taxon, relativeAbundance) %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
relativeAbundance>=0.001~"+"),
relativeAbundance=factor(relativeAbundance, levels=c("+", "-")),
alternative=case_when(Taxon %in% c("Gardnerella_vaginalis", clades, anaerobes)~"greater",
Taxon %in% lactos~"less"),
Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI")),
x=paste0(Cohort, Taxon)) %>%
filter(x!="UAB Enr.Lg") %>%
select(-x) %>%
group_by(alternative) %>%
nest %>%
mutate(data=map(data, ~group_by(.x, Cohort, Taxon))) %>%
mutate(wilcox_test=map2(data, alternative, ~wilcox_test(data=.x, bacLoad~relativeAbundance, alternative=.y))) %>%
select(wilcox_test) %>%
unnest %>%
adjust_pvalue(method = "BH") %>%
mutate(p.star=case_when(p.adj>0.05~"",
p.adj<0.05&p.adj>0.01~"*",
p.adj<0.01&p.adj>0.001~"**",
p.adj<0.001~"***"))
Plot
#fig.width=7, fig.height=2
mDFtoiClades %>%
gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
select(SampleID, Cohort, Taxon, relativeAbundance) %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
relativeAbundance>=0.001~"+"),
Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
ggplot(aes(x=relativeAbundance, y=bacLoad)) +
geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
geom_text(data =bacloadVsTaxWilcox, aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
binaryColors +
facet_grid(Cohort~Taxon) +
scale_y_log10() +
theme(text = element_text(size=15),
axis.text = element_text(size = 12),
legend.position = "none") +
labs(x="", y="Microbial Load")
#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS7_presAbsVsBacLoad.png", sep = "_")))
Microbial load vs. clades per sample and short list microbial load vs. presence-absence
#fig.width=5, fig.height=1.5
bacLoadpresAbs <- mDFtoiClades %>%
gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobesSL)) %>%
select(SampleID, Cohort, Taxon, relativeAbundance) %>%
left_join(bacLoad, by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
relativeAbundance>=0.001~"+"),
Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = c(abbrevsSL)),
Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
ggplot(aes(x=relativeAbundance, y=bacLoad)) +
geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
geom_text(data=subset(bacloadVsTaxWilcox, Taxon %in% abbrevsSL), aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
binaryColors +
facet_grid(Cohort~Taxon) +
scale_y_log10() +
theme(text = element_text(size=15),
axis.text = element_text(size = 12),
strip.text.y = element_text(size=8),
legend.position = "none") +
labs(x="", y="Microbial Load")
plot_grid(loadVClades, bacLoadpresAbs, nrow = 1, rel_widths = c(1,2.5), labels = c("A", "B"), label_size = 15)
#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure4_gardAndBacLoad.png", sep="_")))
Get the dominant taxon in each sample for organizing – using clades. Define as the taxon with the greatest relative abundance. If no taxon has a relative abundance greater than 30%, specify as “no type”
dominantTax <- gardCladeAb %>%
spread(Clade, relativeAbundance) %>%
right_join(mDF, by=c("SampleID", "Cohort")) %>%
select(-Gardnerella_vaginalis) %>%
mutate_at(vars(clades), replace_na, 0) %>%
gather("dominantTax", "abundance", !one_of(c("SampleID", "Cohort"))) %>%
group_by(SampleID) %>%
filter(abundance==max(abundance)) %>%
mutate(dominantTax=case_when(abundance<.3~"No type",
abundance>=.3~dominantTax)) %>%
select(-abundance) %>%
ungroup
unique(dominantTax$dominantTax)
## [1] "C1" "No type"
## [3] "C2" "C3"
## [5] "C4" "C5"
## [7] "C6" "Corynebacterium_sp_HFH0082"
## [9] "Bifidobacterium_longum" "Atopobium_vaginae"
## [11] "Bacteroides_fragilis" "Prevotella_amnii"
## [13] "Prevotella_bivia" "Prevotella_timonensis"
## [15] "Staphylococcus_epidermidis" "Lactobacillus_crispatus"
## [17] "Lactobacillus_delbrueckii" "Lactobacillus_gasseri"
## [19] "Lactobacillus_iners" "Lactobacillus_jensenii"
## [21] "Escherichia_coli" "Haemophilus_parainfluenzae"
## [23] "Mycoplasma_hominis" "Lactobacillus_phage_Lv_1"
## [25] "Alphapapillomavirus_9" "Staphylococcus_aureus"
## [27] "Lactobacillus_helveticus" "Bradyrhizobium_sp_DFCI_1"
## [29] "Enterobacter_cloacae" "Alphapapillomavirus_11"
## [31] "Alphapapillomavirus_14" "Alphapapillomavirus_3"
## [33] "Alphapapillomavirus_5"
length(unique(dominantTax$dominantTax))
## [1] 33
Create dataframe for order of dominant types to order in barplot figure
dominantTaxOrder <- data.frame(dominantTax=c(clades, lactos, "Atopobium_vaginae", "Prevotella_bivia", "Prevotella_amnii", "Prevotella_timonensis", "Mycoplasma_hominis", "Corynebacterium_sp_HFH0082", "Bifidobacterium_longum", "Bacteroides_fragilis", "Staphylococcus_epidermidis", "Lactobacillus_delbrueckii", "Escherichia_coli", "Haemophilus_parainfluenzae", "Lactobacillus_phage_Lv_1", "Alphapapillomavirus_9", "Staphylococcus_aureus", "Lactobacillus_helveticus", "Bradyrhizobium_sp_DFCI_1", "Enterobacter_cloacae", "Alphapapillomavirus_11", "Alphapapillomavirus_14", "Alphapapillomavirus_3", "Alphapapillomavirus_5", "No type"),
dominantTaxOrder=c(1:33))
Stacked bar plot
# Ordered by dominant taxon
taxaBarPlot <- mDFtoiClades %>%
select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
filter(SampleID!="6744034",
SampleID!="6744034_E") %>% # this sample is 100% uncharacterized gard and therefore cannot be displayed in this figure
mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Atopobium_vaginae-Mycoplasma_hominis-Prevotella_bivia-Ureaplasma_parvum) %>%
gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
mutate(relativeAbundance=na_if(relativeAbundance, 0),
Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL))) %>%
left_join(dominantTax, by=c("SampleID", "Cohort")) %>%
left_join(dominantTaxOrder, by="dominantTax") %>%
group_by(SampleID) %>%
ggplot(aes(x=reorder(reorder(SampleID, relativeAbundance, na.rm=TRUE), dominantTaxOrder, FUN=min), y=relativeAbundance, color=Taxon, fill=Taxon)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
facet_wrap(~Cohort, scales = "free") +
labs(x="Sample", y="Relative Abundance") +
taxColorsOther
taxaBarPlot
# ordered by microbial load
mDFtoiClades %>%
select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
mutate(Other=1-(C1+C2+C3+C4+C5+C6+Lactobacillus_crispatus+Lactobacillus_gasseri+Lactobacillus_jensenii+Lactobacillus_iners+Atopobium_vaginae+Mycoplasma_hominis+Prevotella_bivia+Ureaplasma_parvum)) %>%
gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
left_join(bacLoad, by="SampleID") %>%
mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
absoluteAbundance=relativeAbundance*bacLoad) %>%
ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=relativeAbundance, color=Taxon, fill=Taxon)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
facet_wrap(~Cohort, scales="free_x") +
labs(x="Sample", y="Relative Abundance") +
taxColorsOther
# By absolute abundance
aaBarPlot <- mDFtoiClades %>%
select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Atopobium_vaginae-Mycoplasma_hominis-Prevotella_bivia-Ureaplasma_parvum) %>%
gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
left_join(bacLoad, by="SampleID") %>%
mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
absoluteAbundance=relativeAbundance*bacLoad) %>%
left_join(bacLoad, by="SampleID") %>%
ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=absoluteAbundance, color=Taxon, fill=Taxon)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
facet_wrap(~Cohort, scales="free_x") +
labs(x="Sample", y="Absolute Abundance") +
taxColorsOther
aaBarPlot
aaBarPlot +
coord_cartesian(ylim=c(0,1.5))
What is in the outlier samples?
outlierSamples <- bacLoad %>%
filter(bacLoad > 2) %>%
.$SampleID
outlierSamples
## [1] "4009835178" "1005601348" "6744677" "6743991" "6744354"
## [6] "6744871" "6744459" "6744677_E" "6744871_E"
Get top 5 taxa from each outlier sample
mDF %>%
filter(SampleID %in% outlierSamples) %>%
gather("Taxon", "Abundance", !contains(c("SampleID", "Cohort"))) %>%
group_by(SampleID) %>%
top_n(5) %>%
arrange(-Abundance) %>%
split(.$SampleID)
## $`1005601348`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 1005601348 Stanford Enriched Prevotella_timonensis 0.300
## 2 1005601348 Stanford Enriched Gardnerella_vaginalis 0.196
## 3 1005601348 Stanford Enriched Lactobacillus_gasseri 0.106
## 4 1005601348 Stanford Enriched Varibaculum_cambriense 0.0689
## 5 1005601348 Stanford Enriched Clostridiales_bacterium_BV3C26 0.0526
##
## $`4009835178`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 4009835178 UAB Enriched Gardnerella_vaginalis 0.243
## 2 4009835178 UAB Enriched Atopobium_vaginae 0.142
## 3 4009835178 UAB Enriched Prevotella_timonensis 0.0688
## 4 4009835178 UAB Enriched Finegoldia_magna 0.0601
## 5 4009835178 UAB Enriched Prevotella_disiens 0.0405
##
## $`6743991`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6743991 MOMS-PI Prevotella_timonensis 0.185
## 2 6743991 MOMS-PI Porphyromonas_bennonis 0.168
## 3 6743991 MOMS-PI Porphyromonas_asaccharolytica 0.118
## 4 6743991 MOMS-PI Peptoniphilus_sp_BV3AC2 0.0732
## 5 6743991 MOMS-PI Porphyromonas_uenonis 0.0645
##
## $`6744354`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744354 MOMS-PI Prevotella_amnii 0.405
## 2 6744354 MOMS-PI Gardnerella_vaginalis 0.187
## 3 6744354 MOMS-PI Prevotella_timonensis 0.163
## 4 6744354 MOMS-PI Porphyromonas_uenonis 0.0635
## 5 6744354 MOMS-PI Clostridiales_genomosp_BVAB3 0.0529
##
## $`6744459`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744459 MOMS-PI Prevotella_timonensis 0.147
## 2 6744459 MOMS-PI Oligella_urethralis 0.135
## 3 6744459 MOMS-PI Bifidobacterium_breve 0.0911
## 4 6744459 MOMS-PI Peptoniphilus_sp_BV3AC2 0.0813
## 5 6744459 MOMS-PI Gardnerella_vaginalis 0.0658
##
## $`6744677`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744677 MOMS-PI Gardnerella_vaginalis 0.733
## 2 6744677 MOMS-PI Prevotella_amnii 0.172
## 3 6744677 MOMS-PI Megasphaera_genomosp_type_1 0.0398
## 4 6744677 MOMS-PI Atopobium_vaginae 0.0184
## 5 6744677 MOMS-PI Lactobacillus_iners 0.0155
##
## $`6744677_E`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744677_E MOMS-PI Enriched Gardnerella_vaginalis 0.733
## 2 6744677_E MOMS-PI Enriched Prevotella_amnii 0.172
## 3 6744677_E MOMS-PI Enriched Megasphaera_genomosp_type_1 0.0398
## 4 6744677_E MOMS-PI Enriched Atopobium_vaginae 0.0184
## 5 6744677_E MOMS-PI Enriched Lactobacillus_iners 0.0155
##
## $`6744871`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744871 MOMS-PI Gardnerella_vaginalis 0.794
## 2 6744871 MOMS-PI Megasphaera_genomosp_type_1 0.0679
## 3 6744871 MOMS-PI Dialister_micraerophilus 0.0357
## 4 6744871 MOMS-PI Prevotella_amnii 0.0324
## 5 6744871 MOMS-PI Atopobium_vaginae 0.0307
##
## $`6744871_E`
## # A tibble: 5 × 4
## # Groups: SampleID [1]
## SampleID Cohort Taxon Abundance
## <chr> <fct> <chr> <dbl>
## 1 6744871_E MOMS-PI Enriched Gardnerella_vaginalis 0.794
## 2 6744871_E MOMS-PI Enriched Megasphaera_genomosp_type_1 0.0679
## 3 6744871_E MOMS-PI Enriched Dialister_micraerophilus 0.0357
## 4 6744871_E MOMS-PI Enriched Prevotella_amnii 0.0324
## 5 6744871_E MOMS-PI Enriched Atopobium_vaginae 0.0307
Many have Gardnerella, all have Prevotella timonensis or P. amnii
#fig.height=5, fig.width=6.5
jaccards <- mDFtoiClades %>%
select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>%
mutate_if(is.numeric, ~case_when(.x>=0.001~1,
.x<0.001~0)) %>%
split(.$Cohort) %>%
map(~column_to_rownames(.x, var="SampleID")) %>%
map(~select(.x, -Cohort)) %>%
map(t) %>%
map(~vegdist(.x, method="jaccard")) %>%
map(as.matrix)
jaccards$`Stanford Enriched`[lower.tri(jaccards$`Stanford Enriched`, diag = TRUE)] <- NA
jaccards$`UAB Enriched`[lower.tri(jaccards$`UAB Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI Enriched`[lower.tri(jaccards$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI`[lower.tri(jaccards$`MOMS-PI`, diag = TRUE)] <- NA
jaccards %>%
map(melt) %>%
map2(names(.), ~mutate(.x, cohort=.y)) %>%
purrr::reduce(full_join) %>%
mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = abbrevs),
Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobes)), labels = rev(abbrevs)),
cohort=factor(cohort, levels = cohorts)) %>%
filter(!(Var1=="TG" & Var2 %in% clades),
!is.na(value)) %>%
ggplot(aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_x_discrete(drop=FALSE) +
scale_y_discrete(drop=FALSE) +
scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
theme(axis.text.x = element_text(angle=45, hjust=1),
strip.text.x = element_text(size = 13)) +
facet_wrap(~cohort) +
labs(x="", y="", fill="Jaccard Distance")
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS8_jacFourCohort.png", sep="_")), height=6, width=7.5, units="in")
Taxon Abundances and short list of co-occurrences
#fig.width=5.25, fig.height=2
jaccardsSL <- mDFtoiClades %>%
select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobesSL) %>%
mutate_if(is.numeric, ~case_when(.x>=0.001~1,
.x<0.001~0)) %>%
split(.$Cohort) %>%
map(~column_to_rownames(.x, var="SampleID")) %>%
map(~select(.x, -Cohort)) %>%
map(t) %>%
map(~vegdist(.x, method="jaccard")) %>%
map(as.matrix)
jaccardsSL$`Stanford Enriched`[lower.tri(jaccardsSL$`Stanford Enriched`, diag = TRUE)] <- NA
jaccardsSL$`UAB Enriched`[lower.tri(jaccardsSL$`UAB Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI Enriched`[lower.tri(jaccardsSL$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI`[lower.tri(jaccardsSL$`MOMS-PI`, diag = TRUE)] <- NA
jaccardFigure <- jaccardsSL %>%
map(melt) %>%
map2(names(.), ~mutate(.x, cohort=.y)) %>%
purrr::reduce(full_join) %>%
mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = abbrevsSL),
Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobesSL)), labels = rev(abbrevsSL)),
cohort=factor(cohort, levels=cohorts)) %>%
filter(!(Var1=="TG" & Var2 %in% clades),
!is.na(value)) %>%
ggplot(aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_x_discrete(drop=FALSE) +
scale_y_discrete(drop=FALSE) +
scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
theme(axis.text.x = element_text(angle=45, hjust=1, size=7),
axis.text.y = element_text(size=7)) +
facet_wrap(~cohort) +
coord_fixed() +
labs(x="", y="", fill="Jaccard Distance")
plot_grid(taxaBarPlot, jaccardFigure, nrow=1, labels = c("A", "B"), label_size = 15)
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure5_microbiomeAndJaccard.png", sep="_")))
#plot showing clade relative abundance in term vs. preterm birth in all cohorts
allCohortsCladeRelativePTBplot <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella") %>% #SampleID %!in% unchGardSamples,
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
mutate(var=factor(var, levels = clades, labels = clades),
Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
binaryColors +
facet_grid(Cohort~var, scales="free_y") +
theme(legend.position = "none") +
labs(x=NULL, y="Relative Abundance")
allCohortsCladeRelativePTBplot
# create function to mutate dataframe to create differences and then observe transformations
meanDifferenceTransformations <- function(inputDF, vars){
#get list of pairs
pairs <- combn(vars, 2) %>%
t %>%
as.data.frame %>%
mutate(pairs=paste(V1, V2, sep="-")) %>%
.$pairs
# calculate differences and transformations of values
diffs_trans <- inputDF %>%
mutate(meanAbs=set_names(meanAbs, var)) %>%
nest(diffs=c(var, meanAbs)) %>%
mutate(diffs=map(diffs, ~abs(outer(.x$meanAbs, .x$meanAbs, `-`)))) %>% #compute differences
mutate(diffs=map(diffs, data.frame)) %>%
mutate(diffs=map(diffs, ~rownames_to_column(.x, var="var1"))) %>%
mutate(diffs=map(diffs, ~gather(.x, "var2", "diff", vars))) %>%
mutate(diffs=map(diffs, ~transmute(.x, pair=paste(var1, var2, sep="-"), diff=diff))) %>%
mutate(diffs=map(diffs, ~filter(.x, pair %in% pairs))) %>% # filter repeats
mutate(diffs=map(diffs, ~mutate(.x, sqrt=sqrt(diff), #square root transformation
frthrt=diff^(1/4), #fourth root transformation
log=log10(diff + 1e-5)))) %>% #log10 transformation with pseudocount
unnest
# plot histograms
plots <- diffs_trans %>%
gather("Transformation", "Value", c(diff, sqrt, frthrt, log)) %>%
mutate(Transformation=factor(Transformation, levels=c("diff", "sqrt", "frthrt", "log"), labels=c("Difference", "Square Root", "Fourth Root", "log10 with Pseudocount"))) %>%
ggplot(aes(x=Value)) +
geom_histogram() +
facet_wrap(~Transformation, scales="free")
if(vars==clades){
# perform shapiro-wilk test for normality. null hypothesis is normality.
shapiro_results <- map(list(Difference=diffs_trans$diff, sqrt=diffs_trans$sqrt, fourthrt=diffs_trans$frthrt, log=diffs_trans$log), shapiro_test) %>%
map2(., names(.), ~mutate(.x, transformation=.y)) %>%
purrr::reduce(full_join) %>%
select(transformation, statistic, p.value)
return(list(diffs_trans=diffs_trans, plots=plots, shapiro_results=shapiro_results))
}
if(vars==genomos){
return(list(diffs_trans=diffs_trans, plots=plots))
}
}
Assess and choose transformation
cladeRelativeTransfs <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(relativeAbundance)) %>%
meanDifferenceTransformations(., clades)
cladeRelativeTransfs$plots
cladeRelativeTransfs$shapiro_results %>%
formattable()
| transformation | statistic | p.value |
|---|---|---|
| Difference | 0.6367996 | 3.208264e-63 |
| sqrt | 0.8608028 | 1.845471e-46 |
| fourthrt | 0.9725260 | 3.774719e-24 |
| log | 0.8951225 | 5.074615e-42 |
Choose fourth root transformation
# function for performing logistic regression on mean differences of clade abundances
meanDiffLogisticRegression <- function(inputDF, transformation){
# get cohort Ns
coh_ns <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
group_by(Cohort) %>%
summarize(cohort_n=n_distinct(SubjectID))
# perform logitic regressions tests
logiOuts <- inputDF %>%
gather("transformation", "difference", c(diff, sqrt, frthrt, log)) %>% # filter to choose values with appropriate transformation
filter(transformation==transformation) %>%
select(Cohort, TermPre, pair, difference) %>%
mutate(TermPre=case_when(TermPre=="T"~0,
TermPre=="P"~1)) %>%
split(list(.$Cohort, .$pair)) %>%
map(~glm(data=.x, TermPre~difference, family="binomial")) %>%
map(tidy) %>%
map2(., names(.), ~mutate(.x, Cohort_Pair=.y)) %>%
map(~separate(.x, Cohort_Pair, c("Cohort", "Pair"), sep="\\.")) %>%
purrr::reduce(full_join, by = c("term", "estimate", "std.error", "statistic", "p.value", "Cohort", "Pair")) %>%
left_join(coh_ns, by="Cohort") %>%
mutate(std.dev=std.error*sqrt(cohort_n))
logiPlots <- logiOuts %>%
filter(term=="difference") %>%
mutate(Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs),
Pair=factor(Pair, levels = rev(unique(logiOuts$Pair)))) %>%
ggplot(aes(y=Pair, x=estimate, xmin=estimate-std.dev, xmax=estimate+std.dev)) +
geom_pointrange() +
geom_vline(aes(xintercept=0), linetype=2, color="gray", inherit.aes=FALSE) +
facet_grid(~Cohort)
return(list(logiOuts=logiOuts, logiPlots=logiPlots))
}
cladeRelMeanDiffRegression <- meanDiffLogisticRegression(cladeRelativeTransfs$diffs_trans, "frthrt")
cladeRelMeanDiffRegression$logiOuts %>%
filter(term=="difference",
p.value<0.05) %>%
nrow
## [1] 0
Figure S10: All cohorts Gardnerella and PTB logistic regression
#fig.height=2, fig.width=5
plot_grid(allCohortsCladeRelativePTBplot, cladeRelMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v", labels = c("A", "B"), label_size = 15)
#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS10_allCladesLogisticRegressionPTB.png", sep="_")))
Plot abundances
#plot showing clade absolute abundance in term vs. preterm birth in all cohorts
allCohortsCladeAbsolutePTBplot <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
mutate(var=factor(var, levels = clades, labels = clades),
Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
binaryColors +
facet_grid(Cohort~var, scales="free_y") +
theme(legend.position = "none") +
labs(x=NULL, y="Absolute Abundance")
allCohortsCladeAbsolutePTBplot
Assess and choose transformation
cladeAbsoluteTransfs <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(absoluteAbundance)) %>%
meanDifferenceTransformations(., clades)
cladeAbsoluteTransfs$plots
cladeAbsoluteTransfs$shapiro_results %>%
formattable()
| transformation | statistic | p.value |
|---|---|---|
| Difference | 0.4769455 | 2.720409e-70 |
| sqrt | 0.7729894 | 1.124654e-54 |
| fourthrt | 0.9456048 | 1.289173e-32 |
| log | 0.9506172 | 2.456043e-31 |
Choose log10 + pseudocount transformation
Perform logistic regression on mean differences to see if they vary by term or preterm birth Null hypothesis is that coefficients = 0
cladeAbsMeanDiffRegression <- meanDiffLogisticRegression(cladeAbsoluteTransfs$diffs_trans, "log")
cladeAbsMeanDiffRegression$logiOuts %>%
filter(term=="difference",
p.value<0.05) %>%
nrow
## [1] 0
#fig.height=2, fig.width=5
plot_grid(allCohortsCladeAbsolutePTBplot, cladeAbsMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v")
Due to enrichment in Gardnerella in Stanford and UAB clades. The following analyses will focus on the MOMS-PI cohorts. Create data frame:
momsptbDF <- gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(str_detect(Cohort, "MOMS-PI"),
!is.na(TermPre)) %>%
mutate(Cohort=as.character(Cohort))
Wilcox Ranked Sum in MOMS-PI cohort ONLY
cladeRelativeWilcoxResults <- momsptbDF %>%
filter(Cohort=="MOMS-PI",
var!="Uncharacterized_Gardnerella") %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
group_by(var) %>%
wilcox_test(Relative~TermPre, alternative="greater")
cladeRelativeWilcoxResults %>%
arrange(p) %>%
formattable()
| var | .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|---|
| Gardnerella_vaginalis | Relative | P | T | 43 | 90 | 2409.0 | 0.0114 |
| C3 | Relative | P | T | 43 | 90 | 2279.0 | 0.0488 |
| C1 | Relative | P | T | 43 | 90 | 2141.0 | 0.1610 |
| C6 | Relative | P | T | 43 | 90 | 2080.0 | 0.2250 |
| C5 | Relative | P | T | 43 | 90 | 2057.5 | 0.2730 |
| C2 | Relative | P | T | 43 | 90 | 2009.0 | 0.3610 |
| C4 | Relative | P | T | 43 | 90 | 1731.5 | 0.8370 |
Wilcoxon rank sum test: Absolute abundance
cladeAbsoluteWilcoxResults <- momsptbDF %>%
left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", bacLoad<2,
var!="Uncharacterized_Gardnerella") %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
group_by(var) %>%
wilcox_test(Absolute~TermPre, alternative="greater")
cladeAbsoluteWilcoxResults %>%
arrange(p) %>%
formattable()
| var | .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|---|
| Gardnerella_vaginalis | Absolute | P | T | 43 | 90 | 2357.0 | 0.0213 |
| C3 | Absolute | P | T | 43 | 90 | 2222.0 | 0.0835 |
| C1 | Absolute | P | T | 43 | 90 | 2179.0 | 0.1200 |
| C2 | Absolute | P | T | 43 | 90 | 2108.0 | 0.2010 |
| C5 | Absolute | P | T | 43 | 90 | 2088.5 | 0.2250 |
| C6 | Absolute | P | T | 43 | 90 | 2070.5 | 0.2390 |
| C4 | Absolute | P | T | 43 | 90 | 1933.5 | 0.5040 |
Plot
# save p values
cladeWilcoxResults <- cladeRelativeWilcoxResults %>%
full_join(cladeAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
mutate(p.star=case_when(p>0.05~"",
p<0.05&p>0.01~"*",
p<0.01&p>0.001~"**",
p<0.001~"***"),
var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels = c(clades, "Total")))
Relative Abundance Plot:
momspiCladeAbundancePTBPlot <- momsptbDF %>%
filter(Cohort=="MOMS-PI",
var!="Uncharacterized_Gardnerella") %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
mutate(var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
binaryColors +
facet_grid(.~var) +
theme(legend.position = "none",
strip.text.x = element_text(size=13),
strip.text.y = element_text(size=9),
axis.title=element_text(size=16)) +
labs(x=NULL, y="Relative\nAbundance")
momspiCladeAbundancePTBPlot
Absolute Abundance Plot:
momsptbDF %>%
filter(Cohort=="MOMS-PI",
var!="Uncharacterized_Gardnerella") %>%
left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
mutate(var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
binaryColors +
facet_grid(.~var) +
theme(legend.position = "none",
strip.text.x = element_text(size=13),
strip.text.y = element_text(size=9),
axis.title=element_text(size=16)) +
labs(x=NULL, y="Absolute Abundance")
Wilcoxon rank sum test for genomospecies, MOMS-PI cohort ONLY
# Relative abundance
genomoRelativeWilcoxResults <- gardGenomoAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(Cohort=="MOMS-PI",
!is.na(TermPre),
var!="Uncharacterized_Gardnerella") %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
group_by(var) %>%
wilcox_test(Relative~TermPre, alternative="greater")
# Absolute abundance
genomoAbsoluteWilcoxResults <- gardGenomoAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(Cohort=="MOMS-PI",
!is.na(TermPre),
var!="Uncharacterized_Gardnerella") %>%
mutate(Cohort=as.character(Cohort)) %>%
left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
group_by(var) %>%
wilcox_test(Absolute~TermPre, alternative="greater")
genomoRelativeWilcoxResults %>%
arrange(p) %>%
formattable
| var | .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|---|
| GS10 | Relative | P | T | 43 | 90 | 2498.0 | 0.000208 |
| GS9 | Relative | P | T | 43 | 90 | 2524.0 | 0.000298 |
| GS8 | Relative | P | T | 43 | 90 | 2445.0 | 0.001940 |
| GS14 | Relative | P | T | 43 | 90 | 2368.0 | 0.011100 |
| Gardnerella_vaginalis | Relative | P | T | 43 | 90 | 2409.0 | 0.011400 |
| GS2 | Relative | P | T | 43 | 90 | 2207.0 | 0.084200 |
| GS12 | Relative | P | T | 43 | 90 | 2127.0 | 0.160000 |
| GS13 | Relative | P | T | 43 | 90 | 2078.0 | 0.208000 |
| GS11 | Relative | P | T | 43 | 90 | 2023.0 | 0.277000 |
| GS7 | Relative | P | T | 43 | 90 | 2023.5 | 0.321000 |
| GS1 | Relative | P | T | 43 | 90 | 2019.0 | 0.341000 |
| GS4 | Relative | P | T | 43 | 90 | 1971.0 | 0.426000 |
| GS6 | Relative | P | T | 43 | 90 | 1969.0 | 0.436000 |
| GS5 | Relative | P | T | 43 | 90 | 1913.0 | 0.544000 |
| GS3 | Relative | P | T | 43 | 90 | 1869.0 | 0.634000 |
genomoAbsoluteWilcoxResults %>%
arrange(p) %>%
formattable
| var | .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|---|
| GS10 | Absolute | P | T | 43 | 90 | 2502.0 | 0.000189 |
| GS9 | Absolute | P | T | 43 | 90 | 2514.0 | 0.000369 |
| GS8 | Absolute | P | T | 43 | 90 | 2451.0 | 0.001630 |
| GS14 | Absolute | P | T | 43 | 90 | 2376.0 | 0.009930 |
| Gardnerella_vaginalis | Absolute | P | T | 43 | 90 | 2357.0 | 0.021300 |
| GS2 | Absolute | P | T | 43 | 90 | 2181.0 | 0.106000 |
| GS12 | Absolute | P | T | 43 | 90 | 2146.0 | 0.137000 |
| GS13 | Absolute | P | T | 43 | 90 | 2070.0 | 0.221000 |
| GS6 | Absolute | P | T | 43 | 90 | 2081.5 | 0.240000 |
| GS11 | Absolute | P | T | 43 | 90 | 2026.0 | 0.271000 |
| GS1 | Absolute | P | T | 43 | 90 | 2045.0 | 0.296000 |
| GS7 | Absolute | P | T | 43 | 90 | 2011.5 | 0.344000 |
| GS4 | Absolute | P | T | 43 | 90 | 1977.0 | 0.414000 |
| GS5 | Absolute | P | T | 43 | 90 | 1973.0 | 0.427000 |
| GS3 | Absolute | P | T | 43 | 90 | 1902.0 | 0.568000 |
genomoWilcoxResults <- genomoRelativeWilcoxResults %>%
full_join(genomoAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
mutate(p.star=case_when(p>0.05~"",
p<0.05&p>0.01~"*",
p<0.01&p>0.001~"**",
p<0.001~"***"),
var=factor(var, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")))
Relative Abundance Plot:
momspiGenomoAbundancePTBPlot <-gardGenomoAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", !is.na(TermPre),
var!="Uncharacterized_Gardnerella") %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
mutate(var=factor(var, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
binaryColors +
facet_grid(.~var) +
theme(legend.position = "none",
strip.text.x = element_text(size=8),
strip.text.y = element_text(size = 9),
axis.title=element_text(size=16)) +
labs(x=NULL, y="Relative\nAbundance")
momspiGenomoAbundancePTBPlot
Absolute Abundance Plot:
gardGenomoAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", !is.na(TermPre), bacLoad<2,
var!="Uncharacterized_Gardnerella") %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
mutate(var=factor(var, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
geom_quasirandom(alpha=0.5) +
geom_boxplot(color="black", alpha=0) +
geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
binaryColors +
facet_grid(.~var) +
theme(legend.position = "none",
strip.text.x = element_text(size=8),
strip.text.y = element_text(size = 9),
axis.title=element_text(size=16)) +
labs(x=NULL, y="Absolute Abundance")
Mean clades per sample in term vs. preterm birth patients in MOMS-PI Only. Calculate Wilcoxon rank sum test
nCladesWilcox <- momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
spread(var, relativeAbundance) %>%
mutate_at(clades, ~case_when(.x>=0.001~1,
.x<0.001~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
wilcox_test(meanNClades~TermPre, alternative="greater")
nCladesWilcox %>%
formattable()
| .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|
| meanNClades | P | T | 43 | 90 | 2301.5 | 0.0389 |
Plot
momspiCladeCountPlot <- momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
spread(var, relativeAbundance) %>%
mutate_at(clades, ~case_when(.x>=0.001~1,
.x<0.001~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
ylim(0,7) +
binaryColors +
labs(x=NULL, y="Mean Gestational\nClade Count") +
stat_pvalue_manual(nCladesWilcox, label="p", y.position = 6.7)
momspiCladeCountPlot
Perform Wilcoxon Rank Sum test
rarefiedNCladesWilcox <- rarefiedGardDF %>%
filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>%
mutate_at(clades, ~case_when(.x>=0.001~1,
.x<0.001~0)) %>%
mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
wilcox_test(meanNClades~TermPre, alternative = "greater")
rarefiedNCladesWilcox %>%
formattable()
| .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|
| meanNClades | P | T | 42 | 89 | 2189.5 | 0.0571 |
Figure S9: Mean gestational clade richness vs. PTB after rarefying
rarefiedGardDF %>%
filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>%
select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
gather("Clade", "Abundance", c(C1, C2, C3, C4, C5, C6)) %>%
mutate(Abundance=case_when(Abundance>=0.001~1,
Abundance<0.001~0)) %>%
with_groups(c(SampleID, SubjectID, Cohort, TermPre), summarize, nClades=sum(Abundance)) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
ylim(0,7) +
binaryColors +
labs(x=NULL, y="Mean Gestational\nClade Count") +
stat_pvalue_manual(rarefiedNCladesWilcox, label="p", y.position = 6.7)
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS9_rarefiedCladeRichnessvsPTB.png", sep="_")))
Perform Wilcoxon Rank Sum tests
bacloadWilcox <- momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", bacLoad<2) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
wilcox_test(meanBacLoad~TermPre, alternative="greater")
bacloadWilcox %>%
formattable
| .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|
| meanBacLoad | P | T | 43 | 90 | 2436 | 0.00803 |
Plot results
momspiBacLoadPTBplot <- momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
filter(Cohort=="MOMS-PI", bacLoad<2) %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
ggplot(aes(x=TermPre, color=TermPre, y=meanBacLoad)) +
geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
ylim(0,0.75) +
binaryColors +
labs(x=NULL, y="Mean Gestational\nMicrobial Load") +
stat_pvalue_manual(bacloadWilcox, label="p", y.position = 0.7)
#fig.height=3, fig.width=5
momspiPTBAbundancePlots <- plot_grid(momspiCladeAbundancePTBPlot, momspiGenomoAbundancePTBPlot, ncol=1, labels = c("A", "B"), label_size = 15)
momspiCladeCountBacLoadPTBPlots <- plot_grid(momspiCladeCountPlot, momspiBacLoadPTBplot, nrow=1, labels = c("C", "D"), label_size = 15)
plot_grid(momspiPTBAbundancePlots, momspiCladeCountBacLoadPTBPlots, ncol=1, rel_heights = c(1, 0.6))
#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure6_momspiPTB.png", sep="_")))
libSizeWilcox <- momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
wilcox_test(meanLibSize~TermPre, alternative="greater")
libSizeWilcox %>%
formattable
| .y. | group1 | group2 | n1 | n2 | statistic | p |
|---|---|---|---|---|---|---|
| meanLibSize | P | T | 43 | 90 | 2007 | 0.365 |
momsptbDF %>%
filter(Cohort=="MOMS-PI") %>%
left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
ggplot(aes(x=TermPre, color=TermPre, y=meanLibSize)) +
geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
binaryColors +
theme(axis.text = element_text(size=13),
strip.text = element_text(size=13),
axis.title = element_text(size=16)) +
labs(x=NULL, y="Mean Subject Library Size") +
stat_pvalue_manual(libSizeWilcox, label="p", y.position = 3.5e+07)
Subject average abundance by self reported race
gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "Race")], by="SampleID") %>%
group_by(SubjectID, Cohort, Race, var) %>%
summarise(meanRelativeAbundance=mean(relativeAbundance)) %>%
mutate(var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
ggplot(aes(x=Race, y=meanRelativeAbundance, color=Race, fill=Race)) +
geom_point(alpha=0.5, position = position_jitterdodge()) +
geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
facet_grid(Cohort~var) +
scale_x_discrete(labels=c("A", "B", "W", "O")) +
theme(legend.position = "bottom") +
labs(x=NULL, y="Relative Abundance")
#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS11_cladeRelativeAbundanceRace.png", sep="_")))
Subject average absolute abundance by race
gardCladeAbU %>%
left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "Race", "bacLoad")], by="SampleID") %>%
filter(bacLoad<2) %>%
mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
group_by(SubjectID, Cohort, Race, var) %>%
summarise(meanAbsoluteAbundance=mean(absoluteAbundance)) %>%
mutate(var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
ggplot(aes(x=Race, y=meanAbsoluteAbundance, color=Race, fill=Race)) +
geom_point(alpha=0.5, position = position_jitterdodge()) +
geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
facet_grid(Cohort~var) +
scale_x_discrete(labels=c("A", "B", "W", "O")) +
theme(legend.position = "bottom") +
labs(x=NULL, y="Absolute Abundance")
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.4.0 rstatix_0.7.0 cowplot_1.1.1 coin_1.4-2
## [5] survival_3.2-13 ggExtra_0.9 ggbeeswarm_0.6.0 vegan_2.5-7
## [9] lattice_0.20-45 permute_0.9-5 reshape2_1.4.4 formattable_0.2.1
## [13] broom_0.7.9 kableExtra_1.3.4 forcats_0.5.1 stringr_1.4.0
## [17] dplyr_1.0.7 purrr_0.3.4 readr_2.0.2 tidyr_1.1.4
## [21] tibble_3.1.5 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.1-0 colorspace_2.0-2 ggsignif_0.6.3
## [4] ellipsis_0.3.2 modeltools_0.2-23 fs_1.5.0
## [7] rstudioapi_0.13 farver_2.1.0 bit64_4.0.5
## [10] fansi_0.5.0 mvtnorm_1.1-3 lubridate_1.8.0
## [13] xml2_1.3.2 codetools_0.2-18 splines_4.1.1
## [16] libcoin_1.0-9 knitr_1.36 jsonlite_1.7.2
## [19] cluster_2.1.2 dbplyr_2.1.1 shiny_1.7.1
## [22] compiler_4.1.1 httr_1.4.2 backports_1.2.1
## [25] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [28] cli_3.4.1 later_1.3.0 htmltools_0.5.2
## [31] tools_4.1.1 gtable_0.3.0 glue_1.4.2
## [34] Rcpp_1.0.7 carData_3.0-5 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.5.1 svglite_2.0.0
## [40] nlme_3.1-153 xfun_0.26 ps_1.6.0
## [43] rvest_1.0.1 mime_0.12 miniUI_0.1.1.1
## [46] lifecycle_1.0.3 MASS_7.3-54 zoo_1.8-10
## [49] scales_1.1.1 vroom_1.5.5 hms_1.1.1
## [52] promises_1.2.0.1 parallel_4.1.1 sandwich_3.0-1
## [55] yaml_2.2.1 sass_0.4.0 stringi_1.7.8
## [58] highr_0.9 rlang_1.0.6 pkgconfig_2.0.3
## [61] systemfonts_1.0.2 matrixStats_0.61.0 evaluate_0.14
## [64] labeling_0.4.2 htmlwidgets_1.5.4 processx_3.5.2
## [67] bit_4.0.4 tidyselect_1.1.1 plyr_1.8.6
## [70] magrittr_2.0.1 R6_2.5.1 magick_2.7.3
## [73] generics_0.1.0 multcomp_1.4-18 DBI_1.1.1
## [76] pillar_1.6.3 haven_2.4.3 withr_2.4.2
## [79] mgcv_1.8-38 abind_1.4-5 modelr_0.1.8
## [82] crayon_1.4.1 car_3.0-12 utf8_1.2.2
## [85] tzdb_0.1.2 rmarkdown_2.11 grid_4.1.1
## [88] readxl_1.3.1 callr_3.7.0 reprex_2.0.1
## [91] digest_0.6.28 webshot_0.5.2 xtable_1.8-4
## [94] httpuv_1.6.3 stats4_4.1.1 munsell_0.5.0
## [97] beeswarm_0.4.0 viridisLite_0.4.0 vipor_0.4.5
## [100] bslib_0.3.1